diff --git a/.editorconfig b/.editorconfig index 1eadda334..c90b171f5 100644 --- a/.editorconfig +++ b/.editorconfig @@ -48,3 +48,7 @@ end_of_line = unset charset = unset trim_trailing_whitespace = unset insert_final_newline = unset + +[vendor/miniaudio/miniaudio.h] +trim_trailing_whitespace = unset +insert_final_newline = unset diff --git a/.github/workflows/build-linux-cross.yml b/.github/workflows/build-linux-cross.yml index dbd31e589..92dc41f9d 100644 --- a/.github/workflows/build-linux-cross.yml +++ b/.github/workflows/build-linux-cross.yml @@ -26,12 +26,12 @@ jobs: sudo apt-get install -y --no-install-recommends \ build-essential \ gcc-14-riscv64-linux-gnu \ - g++-14-riscv64-linux-gnu \ - libcurl4-openssl-dev:riscv64 + g++-14-riscv64-linux-gnu - name: Build run: | - cmake -B build -DCMAKE_BUILD_TYPE=Release \ + cmake -B build -DLLAMA_CURL=OFF \ + -DCMAKE_BUILD_TYPE=Release \ -DGGML_OPENMP=OFF \ -DLLAMA_BUILD_EXAMPLES=ON \ -DLLAMA_BUILD_TOOLS=ON \ @@ -72,12 +72,12 @@ jobs: glslc \ gcc-14-riscv64-linux-gnu \ g++-14-riscv64-linux-gnu \ - libvulkan-dev:riscv64 \ - libcurl4-openssl-dev:riscv64 + libvulkan-dev:riscv64 - name: Build run: | - cmake -B build -DCMAKE_BUILD_TYPE=Release \ + cmake -B build -DLLAMA_CURL=OFF \ + -DCMAKE_BUILD_TYPE=Release \ -DGGML_VULKAN=ON \ -DGGML_OPENMP=OFF \ -DLLAMA_BUILD_EXAMPLES=ON \ @@ -118,12 +118,12 @@ jobs: build-essential \ glslc \ crossbuild-essential-arm64 \ - libvulkan-dev:arm64 \ - libcurl4-openssl-dev:arm64 + libvulkan-dev:arm64 - name: Build run: | - cmake -B build -DCMAKE_BUILD_TYPE=Release \ + cmake -B build -DLLAMA_CURL=OFF \ + -DCMAKE_BUILD_TYPE=Release \ -DGGML_VULKAN=ON \ -DGGML_OPENMP=OFF \ -DLLAMA_BUILD_EXAMPLES=ON \ @@ -163,12 +163,12 @@ jobs: sudo apt-get install -y --no-install-recommends \ build-essential \ gcc-14-powerpc64le-linux-gnu \ - g++-14-powerpc64le-linux-gnu \ - libcurl4-openssl-dev:ppc64el + g++-14-powerpc64le-linux-gnu - name: Build run: | - cmake -B build -DCMAKE_BUILD_TYPE=Release \ + cmake -B build -DLLAMA_CURL=OFF \ + -DCMAKE_BUILD_TYPE=Release \ -DGGML_OPENMP=OFF \ -DLLAMA_BUILD_EXAMPLES=ON \ -DLLAMA_BUILD_TOOLS=ON \ @@ -209,12 +209,12 @@ jobs: glslc \ gcc-14-powerpc64le-linux-gnu \ g++-14-powerpc64le-linux-gnu \ - libvulkan-dev:ppc64el \ - libcurl4-openssl-dev:ppc64el + libvulkan-dev:ppc64el - name: Build run: | - cmake -B build -DCMAKE_BUILD_TYPE=Release \ + cmake -B build -DLLAMA_CURL=OFF \ + -DCMAKE_BUILD_TYPE=Release \ -DGGML_VULKAN=ON \ -DGGML_OPENMP=OFF \ -DLLAMA_BUILD_EXAMPLES=ON \ diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ed827bf70..65ed24465 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,4 +1,4 @@ -name: Create Release +name: Release on: workflow_dispatch: # allows manual triggering @@ -227,6 +227,69 @@ jobs: path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip name: llama-bin-ubuntu-vulkan-x64.zip + windows-cpu: + runs-on: windows-latest + + strategy: + matrix: + include: + - arch: 'x64' + - arch: 'arm64' + + steps: + - name: Clone + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2.16 + with: + key: windows-latest-cmake-cpu-${{ matrix.arch }} + variant: ccache + evict-old-files: 1d + + - name: Install Ninja + run: | + choco install ninja + + - name: libCURL + id: get_libcurl + uses: ./.github/actions/windows-setup-curl + with: + architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }} + + - name: Build + shell: cmd + env: + CURL_PATH: ${{ 
steps.get_libcurl.outputs.curl_path }} + run: | + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch }} + cmake -S . -B build -G "Ninja Multi-Config" ^ + -D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^ + -DGGML_NATIVE=OFF ^ + -DGGML_BACKEND_DL=ON ^ + -DGGML_CPU_ALL_VARIANTS=${{ matrix.arch == 'x64' && 'ON' || 'OFF' }} ^ + -DGGML_OPENMP=ON ^ + -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^ + ${{ env.CMAKE_ARGS }} + cmake --build build --config Release + + - name: Pack artifacts + id: pack_artifacts + env: + CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} + run: | + Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\ + Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.42.34433\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\ + 7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\* + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + path: llama-bin-win-cpu-${{ matrix.arch }}.zip + name: llama-bin-win-cpu-${{ matrix.arch }}.zip + windows: runs-on: windows-latest @@ -237,52 +300,30 @@ jobs: strategy: matrix: include: - - build: 'cpu-x64' + - backend: 'vulkan' arch: 'x64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF' - #- build: 'openblas-x64' - # arch: 'x64' - # defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' - - build: 'vulkan-x64' - arch: 'x64' - defines: '-DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON' - - build: 'cpu-arm64' - arch: 'arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF' - - build: 'opencl-adreno-arm64' + defines: '-DGGML_VULKAN=ON' + target: 'ggml-vulkan' + - backend: 'opencl-adreno' arch: 'arm64' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON' + target: 'ggml-opencl' steps: - name: Clone id: checkout uses: actions/checkout@v4 - with: - fetch-depth: 0 - name: ccache uses: hendrikmuhs/ccache-action@v1.2.16 with: - key: windows-latest-cmake-${{ matrix.build }} + key: windows-latest-cmake-${{ matrix.backend }}-${{ matrix.arch }} variant: ccache evict-old-files: 1d - - name: Download OpenBLAS - id: get_openblas - if: ${{ matrix.build == 'openblas-x64' }} - run: | - curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip" - curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE" - mkdir $env:RUNNER_TEMP/openblas - tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas - $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath) - $msvc 
= $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim())) - $lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe') - & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll - - name: Install Vulkan SDK id: get_vulkan - if: ${{ matrix.build == 'vulkan-x64' }} + if: ${{ matrix.backend == 'vulkan' }} run: | curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe" & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install @@ -296,7 +337,7 @@ jobs: - name: Install OpenCL Headers and Libs id: install_opencl - if: ${{ matrix.build == 'opencl-adreno-arm64' }} + if: ${{ matrix.backend == 'opencl-adreno' && matrix.arch == 'arm64' }} run: | git clone https://github.com/KhronosGroup/OpenCL-Headers cd OpenCL-Headers @@ -314,46 +355,22 @@ jobs: -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" cmake --build build-arm64-release --target install --config release - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - with: - architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }} - - name: Build id: cmake_build - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | - cmake -S . -B build ${{ matrix.defines }} ` - -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" ` - ${{ env.CMAKE_ARGS }} - cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} - - - name: Add libopenblas.dll - id: add_libopenblas_dll - if: ${{ matrix.build == 'openblas-x64' }} - run: | - cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll - cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt - - - name: Determine tag name - id: tag - uses: ./.github/actions/get-tag-name + cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_CURL=OFF + cmake --build build --config Release --target ${{ matrix.target }} - name: Pack artifacts id: pack_artifacts - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | - Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\ - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\* + 7z a llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll - name: Upload artifacts uses: actions/upload-artifact@v4 with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip - name: llama-bin-win-${{ matrix.build }}.zip + path: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip + name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip windows-cuda: runs-on: windows-2019 @@ -366,8 +383,6 @@ jobs: - name: Clone id: checkout uses: actions/checkout@v4 - with: - fetch-depth: 0 - name: Install ccache uses: hendrikmuhs/ccache-action@v1.2.16 @@ -386,45 +401,30 @@ jobs: run: | choco install ninja - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - - name: Build id: cmake_build shell: cmd - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" cmake -S . 
-B build -G "Ninja Multi-Config" ^ - -DGGML_NATIVE=OFF ^ -DGGML_BACKEND_DL=ON ^ - -DGGML_CPU_ALL_VARIANTS=ON ^ + -DGGML_NATIVE=OFF ^ + -DGGML_CPU=OFF ^ -DGGML_CUDA=ON ^ - -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^ - ${{ env.CMAKE_ARGS }} + -DLLAMA_CURL=OFF set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1 - cmake --build build --config Release -j %NINJA_JOBS% -t ggml - cmake --build build --config Release - - - name: Determine tag name - id: tag - uses: ./.github/actions/get-tag-name + cmake --build build --config Release -j %NINJA_JOBS% --target ggml-cuda - name: Pack artifacts id: pack_artifacts - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | - cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip .\build\bin\Release\* + 7z a llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll - name: Upload artifacts uses: actions/upload-artifact@v4 with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip - name: llama-bin-win-cuda${{ matrix.cuda }}-x64.zip + path: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip + name: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip - name: Copy and pack Cuda runtime run: | @@ -432,13 +432,13 @@ jobs: $dst='.\build\bin\cudart\' robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll - 7z a cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip $dst\* + 7z a cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip $dst\* - name: Upload Cuda runtime uses: actions/upload-artifact@v4 with: - path: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip - name: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip + path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip + name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip windows-sycl: runs-on: windows-latest @@ -451,12 +451,11 @@ jobs: WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7cd9bba0-7aab-4e30-b3ae-2221006a4a05/intel-oneapi-base-toolkit-2025.1.1.34_offline.exe WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" + steps: - name: Clone id: checkout uses: actions/checkout@v4 - with: - fetch-depth: 0 - name: ccache uses: hendrikmuhs/ccache-action@v1.2.16 @@ -469,15 +468,18 @@ jobs: run: | scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL - # TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args - - name: Build id: cmake_build - run: examples/sycl/win-build-sycl.bat - - - name: Determine tag name - id: tag - uses: ./.github/actions/get-tag-name + shell: cmd + run: | + call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force + cmake -G "Ninja" -B build ^ + -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^ + -DCMAKE_BUILD_TYPE=Release ^ + -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^ + -DGGML_CPU=OFF -DGGML_SYCL=ON ^ + -DLLAMA_CURL=OFF + cmake --build build --target ggml-sycl -j - name: Build the release package id: pack_artifacts @@ -502,12 +504,12 @@ jobs: cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin echo "cp oneAPI running time dll files to ./build/bin done" - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip 
./build/bin/* + 7z a llama-bin-win-sycl-x64.zip ./build/bin/* - name: Upload the release package uses: actions/upload-artifact@v4 with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip + path: llama-bin-win-sycl-x64.zip name: llama-bin-win-sycl-x64.zip windows-hip: @@ -515,14 +517,14 @@ jobs: strategy: matrix: - gpu_target: [gfx1100, gfx1101, gfx1030] + include: + - name: "radeon" + gpu_targets: "gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032" steps: - name: Clone id: checkout uses: actions/checkout@v4 - with: - fetch-depth: 0 - name: Clone rocWMMA repository id: clone_rocwmma @@ -532,7 +534,7 @@ jobs: - name: ccache uses: hendrikmuhs/ccache-action@v1.2.16 with: - key: windows-latest-cmake-hip-release + key: windows-latest-cmake-hip-${{ matrix.name }}-x64 evict-old-files: 1d - name: Install @@ -550,50 +552,39 @@ jobs: run: | & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - - name: Build id: cmake_build - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" cmake -G "Unix Makefiles" -B build -S . ` -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" ` -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" ` - -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" ` + -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/ -Wno-ignored-attributes -Wno-nested-anon-types" ` -DCMAKE_BUILD_TYPE=Release ` - -DAMDGPU_TARGETS=${{ matrix.gpu_target }} ` + -DGGML_BACKEND_DL=ON ` + -DGGML_NATIVE=OFF ` + -DGGML_CPU=OFF ` + -DAMDGPU_TARGETS="${{ matrix.gpu_targets }}" ` -DGGML_HIP_ROCWMMA_FATTN=ON ` -DGGML_HIP=ON ` - -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" ` - ${{ env.CMAKE_ARGS }} - cmake --build build -j ${env:NUMBER_OF_PROCESSORS} + -DLLAMA_CURL=OFF + cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS} md "build\bin\rocblas\library\" cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\" cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\" cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\" - - name: Determine tag name - id: tag - uses: ./.github/actions/get-tag-name - - name: Pack artifacts id: pack_artifacts - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | - cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\libcurl-x64.dll - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\* + 7z a llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\* - name: Upload artifacts uses: actions/upload-artifact@v4 with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip - name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip + path: llama-bin-win-hip-${{ matrix.name }}-x64.zip + name: llama-bin-win-hip-${{ matrix.name }}-x64.zip ios-xcode-build: runs-on: macos-latest @@ -655,14 +646,16 @@ jobs: runs-on: ubuntu-latest needs: - - ubuntu-22-cpu - - ubuntu-22-vulkan - windows + - windows-cpu - windows-cuda - windows-sycl - windows-hip + - ubuntu-22-cpu + - ubuntu-22-vulkan - macOS-arm64 - macOS-x64 + - ios-xcode-build steps: - name: Clone @@ -680,10 +673,43 @@ jobs: uses: actions/download-artifact@v4 with: path: ./artifact + merge-multiple: true - name: Move artifacts id: move_artifacts - run: mkdir -p ./artifact/release && mv 
./artifact/*/*.zip ./artifact/release + run: | + mkdir -p release + + echo "Adding CPU backend files to existing zips..." + for arch in x64 arm64; do + cpu_zip="artifact/llama-bin-win-cpu-${arch}.zip" + temp_dir=$(mktemp -d) + echo "Extracting CPU backend for $arch..." + unzip "$cpu_zip" -d "$temp_dir" + + echo "Adding CPU files to $arch zips..." + for target_zip in artifact/llama-bin-win-*-${arch}.zip; do + if [[ "$target_zip" == "$cpu_zip" ]]; then + continue + fi + echo "Adding CPU backend to $(basename "$target_zip")" + realpath_target_zip=$(realpath "$target_zip") + (cd "$temp_dir" && zip -r "$realpath_target_zip" .) + done + + rm -rf "$temp_dir" + done + + echo "Renaming and moving zips to release..." + for zip_file in artifact/llama-bin-win-*.zip; do + base_name=$(basename "$zip_file" .zip) + zip_name="llama-${{ steps.tag.outputs.name }}-${base_name#llama-}.zip" + echo "Moving $zip_file to release/$zip_name" + mv "$zip_file" "release/$zip_name" + done + + echo "Moving other artifacts..." + mv -v artifact/*.zip release - name: Create release id: create_release @@ -702,7 +728,7 @@ jobs: const path = require('path'); const fs = require('fs'); const release_id = '${{ steps.create_release.outputs.id }}'; - for (let file of await fs.readdirSync('./artifact/release')) { + for (let file of await fs.readdirSync('./release')) { if (path.extname(file) === '.zip') { console.log('uploadReleaseAsset', file); await github.repos.uploadReleaseAsset({ @@ -710,7 +736,7 @@ jobs: repo: context.repo.repo, release_id: release_id, name: file, - data: await fs.readFileSync(`./artifact/release/${file}`) + data: await fs.readFileSync(`./release/${file}`) }); } } diff --git a/.github/workflows/winget.yml b/.github/workflows/winget.yml new file mode 100644 index 000000000..5c2861559 --- /dev/null +++ b/.github/workflows/winget.yml @@ -0,0 +1,42 @@ +name: Update Winget Package + +on: + workflow_dispatch: # allows manual triggering + schedule: + - cron: '28 5 * * *' # Update every day at 5:28 UTC + +jobs: + update: + name: Update Winget Package + runs-on: ubuntu-latest + + steps: + - name: Install cargo binstall + uses: cargo-bins/cargo-binstall@268643a6b5ea099f5718ee5cd3ff7dc89a5eb49b + + - name: Install komac + run: | + cargo binstall komac@2.11.2 -y + + - name: Find latest release + id: find_latest_release + uses: actions/github-script@v6 + with: + script: | + const { data: releases } = await github.rest.repos.listReleases({ + owner: context.repo.owner, + repo: context.repo.repo, + }); + console.log("Latest release:", releases[0].tag_name); + return releases[0].tag_name; + + - name: Update manifest + env: + VERSION: ${{ steps.find_latest_release.outputs.result }} + run: | + echo "Updating manifest..." + komac update --version ${{ env.VERSION }} \ + --urls "https://github.com/ggml-org/llama.cpp/releases/download/${{ env.VERSION }}/llama-${{ env.VERSION }}-bin-win-vulkan-x64.zip" \ + --token ${{ secrets.WINGET_GITHUB_TOKEN }} \ + --submit \ + ggml.llamacpp diff --git a/README.md b/README.md index d1cb8d833..576332bc5 100644 --- a/README.md +++ b/README.md @@ -130,6 +130,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
Bindings +- Python: [ddh0/easy-llama](https://github.com/ddh0/easy-llama) - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp) @@ -580,3 +581,4 @@ $ echo "source ~/.llama-completion.bash" >> ~/.bashrc - [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License - [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License - [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html) +- [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index a7ff3ac16..564af1448 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -58,19 +58,20 @@ add_library(${TARGET} STATIC arg.cpp arg.h base64.hpp + chat-parser.cpp + chat-parser.h chat.cpp chat.h common.cpp common.h console.cpp console.h + json-partial.cpp + json-partial.h json-schema-to-grammar.cpp - json.hpp llguidance.cpp log.cpp log.h - minja/chat-template.hpp - minja/minja.hpp ngram-cache.cpp ngram-cache.h regex-partial.cpp @@ -143,7 +144,7 @@ if (LLAMA_LLGUIDANCE) set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS}) endif () -target_include_directories(${TARGET} PUBLIC .) +target_include_directories(${TARGET} PUBLIC . ../vendor) target_compile_features (${TARGET} PUBLIC cxx_std_17) target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) diff --git a/common/arg.cpp b/common/arg.cpp index 997f732cc..cfa9878f9 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1,10 +1,11 @@ -#include "gguf.h" // for reading GGUF splits #include "arg.h" +#include "chat.h" #include "common.h" +#include "gguf.h" // for reading GGUF splits +#include "json-schema-to-grammar.h" #include "log.h" #include "sampling.h" -#include "chat.h" // fix problem with std::min and std::max #if defined(_WIN32) @@ -15,6 +16,9 @@ #include #endif +#define JSON_ASSERT GGML_ASSERT +#include + #include #include #include @@ -34,12 +38,10 @@ #include #endif -#include "json-schema-to-grammar.h" - using json = nlohmann::ordered_json; std::initializer_list mmproj_examples = { - LLAMA_EXAMPLE_LLAVA, + LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_SERVER, }; @@ -242,7 +244,56 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma } // download one single file from remote URL to local path -static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) { +static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline) { + // Check if the file already exists locally + auto file_exists = std::filesystem::exists(path); + + // If the file exists, check its JSON metadata companion file. 
+ std::string metadata_path = path + ".json"; + nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead + std::string etag; + std::string last_modified; + + if (file_exists) { + if (offline) { + LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str()); + return true; // skip verification/downloading + } + // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block). + std::ifstream metadata_in(metadata_path); + if (metadata_in.good()) { + try { + metadata_in >> metadata; + LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str()); + if (metadata.contains("etag") && metadata.at("etag").is_string()) { + etag = metadata.at("etag"); + } + if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) { + last_modified = metadata.at("lastModified"); + } + } catch (const nlohmann::json::exception & e) { + LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what()); + } + } + // if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again) + } else { + if (offline) { + LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str()); + return false; + } + LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str()); + } + + // Send a HEAD request to retrieve the etag and last-modified headers + struct common_load_model_from_url_headers { + std::string etag; + std::string last_modified; + }; + + common_load_model_from_url_headers headers; + bool head_request_ok = false; + bool should_download = !file_exists; // by default, we should download if the file does not exist + // Initialize libcurl curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); curl_slist_ptr http_headers; @@ -269,91 +320,47 @@ static bool common_download_file_single(const std::string & url, const std::stri curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); #endif - // Check if the file already exists locally - auto file_exists = std::filesystem::exists(path); + typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *); + auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t { + common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata; - // If the file exists, check its JSON metadata companion file. - std::string metadata_path = path + ".json"; - nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead - std::string etag; - std::string last_modified; + static std::regex header_regex("([^:]+): (.*)\r\n"); + static std::regex etag_regex("ETag", std::regex_constants::icase); + static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase); - if (file_exists) { - // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block). 
- std::ifstream metadata_in(metadata_path); - if (metadata_in.good()) { - try { - metadata_in >> metadata; - LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str()); - if (metadata.contains("etag") && metadata.at("etag").is_string()) { - etag = metadata.at("etag"); - } - if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) { - last_modified = metadata.at("lastModified"); - } - } catch (const nlohmann::json::exception & e) { - LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what()); + std::string header(buffer, n_items); + std::smatch match; + if (std::regex_match(header, match, header_regex)) { + const std::string & key = match[1]; + const std::string & value = match[2]; + if (std::regex_match(key, match, etag_regex)) { + headers->etag = value; + } else if (std::regex_match(key, match, last_modified_regex)) { + headers->last_modified = value; } } - // if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again) - } else { - LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str()); - } - - // Send a HEAD request to retrieve the etag and last-modified headers - struct common_load_model_from_url_headers { - std::string etag; - std::string last_modified; + return n_items; }; - common_load_model_from_url_headers headers; - bool head_request_ok = false; - bool should_download = !file_exists; // by default, we should download if the file does not exist + curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb + curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress + curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast(header_callback)); + curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers); - // get ETag to see if the remote file has changed - { - typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *); - auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t { - common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata; + // we only allow retrying once for HEAD requests + // this is for the use case of using running offline (no internet), retrying can be annoying + bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD"); + if (!was_perform_successful) { + head_request_ok = false; + } - static std::regex header_regex("([^:]+): (.*)\r\n"); - static std::regex etag_regex("ETag", std::regex_constants::icase); - static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase); - - std::string header(buffer, n_items); - std::smatch match; - if (std::regex_match(header, match, header_regex)) { - const std::string & key = match[1]; - const std::string & value = match[2]; - if (std::regex_match(key, match, etag_regex)) { - headers->etag = value; - } else if (std::regex_match(key, match, last_modified_regex)) { - headers->last_modified = value; - } - } - return n_items; - }; - - curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb - curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress - curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast(header_callback)); - curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers); - - // we only allow retrying once for HEAD requests - // this is for 
the use case of using running offline (no internet), retrying can be annoying - bool was_perform_successful = curl_perform_with_retry(url, curl.get(), 1, 0, "HEAD"); - if (!was_perform_successful) { - head_request_ok = false; - } - - long http_code = 0; - curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code); - if (http_code == 200) { - head_request_ok = true; - } else { - LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code); - head_request_ok = false; - } + long http_code = 0; + curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code); + if (http_code == 200) { + head_request_ok = true; + } else { + LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code); + head_request_ok = false; } // if head_request_ok is false, we don't have the etag or last-modified headers @@ -460,12 +467,12 @@ static bool common_download_file_single(const std::string & url, const std::stri // download multiple files from remote URLs to local paths // the input is a vector of pairs -static bool common_download_file_multiple(const std::vector> & urls, const std::string & bearer_token) { +static bool common_download_file_multiple(const std::vector> & urls, const std::string & bearer_token, bool offline) { // Prepare download in parallel std::vector> futures_download; for (auto const & item : urls) { - futures_download.push_back(std::async(std::launch::async, [bearer_token](const std::pair & it) -> bool { - return common_download_file_single(it.first, it.second, bearer_token); + futures_download.push_back(std::async(std::launch::async, [bearer_token, offline](const std::pair & it) -> bool { + return common_download_file_single(it.first, it.second, bearer_token, offline); }, item)); } @@ -481,14 +488,15 @@ static bool common_download_file_multiple(const std::vector> common_remote_get_content(const std::string & * * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files. */ -static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) { +static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token, bool offline) { auto parts = string_split(hf_repo_with_tag, ':'); std::string tag = parts.size() > 1 ? 
parts.back() : "latest"; std::string hf_repo = parts[0]; @@ -638,20 +646,25 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_ long res_code = 0; std::string res_str; bool use_cache = false; - try { - auto res = common_remote_get_content(url, params); - res_code = res.first; - res_str = std::string(res.second.data(), res.second.size()); - } catch (const std::exception & e) { - LOG_WRN("error: failed to get manifest: %s\n", e.what()); - LOG_WRN("try reading from cache\n"); - // try to read from cache + if (!offline) { try { + auto res = common_remote_get_content(url, params); + res_code = res.first; + res_str = std::string(res.second.data(), res.second.size()); + } catch (const std::exception & e) { + LOG_WRN("error: failed to get manifest at %s: %s\n", url.c_str(), e.what()); + } + } + if (res_code == 0) { + if (std::filesystem::exists(cached_response_path)) { + LOG_WRN("trying to read manifest from cache: %s\n", cached_response_path.c_str()); res_str = read_file(cached_response_path); res_code = 200; use_cache = true; - } catch (const std::exception & e) { - throw std::runtime_error("error: failed to get manifest (check your internet connection)"); + } else { + throw std::runtime_error( + offline ? "error: failed to get manifest (offline mode)" + : "error: failed to get manifest (check your internet connection)"); } } std::string ggufFile; @@ -698,24 +711,25 @@ bool common_has_curl() { return false; } -static bool common_download_file_single(const std::string &, const std::string &, const std::string &) { +static bool common_download_file_single(const std::string &, const std::string &, const std::string &, bool) { LOG_ERR("error: built without CURL, cannot download model from internet\n"); return false; } -static bool common_download_file_multiple(const std::vector> &, const std::string &) { +static bool common_download_file_multiple(const std::vector> &, const std::string &, bool) { LOG_ERR("error: built without CURL, cannot download model from the internet\n"); return false; } static bool common_download_model( const common_params_model &, - const std::string &) { + const std::string &, + bool) { LOG_ERR("error: built without CURL, cannot download model from the internet\n"); return false; } -static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &) { +static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool) { LOG_ERR("error: built without CURL, cannot download model from the internet\n"); return {}; } @@ -742,7 +756,8 @@ struct handle_model_result { static handle_model_result common_params_handle_model( struct common_params_model & model, const std::string & bearer_token, - const std::string & model_path_default) { + const std::string & model_path_default, + bool offline) { handle_model_result result; // handle pre-fill default model path and url based on hf_repo and hf_file { @@ -750,7 +765,7 @@ static handle_model_result common_params_handle_model( // short-hand to avoid specifying --hf-file -> default it to --model if (model.hf_file.empty()) { if (model.path.empty()) { - auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token); + auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline); if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) { exit(1); // built without CURL, error message already printed } @@ -791,7 +806,7 @@ static handle_model_result common_params_handle_model( // then, download it if needed if 
(!model.url.empty()) { - bool ok = common_download_model(model, bearer_token); + bool ok = common_download_model(model, bearer_token, offline); if (!ok) { LOG_ERR("error: failed to download model from %s\n", model.url.c_str()); exit(1); @@ -934,7 +949,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context // handle model and download { - auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH); + auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline); if (params.no_mmproj) { params.mmproj = {}; } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) { @@ -944,12 +959,12 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context // only download mmproj if the current example is using it for (auto & ex : mmproj_examples) { if (ctx_arg.ex == ex) { - common_params_handle_model(params.mmproj, params.hf_token, ""); + common_params_handle_model(params.mmproj, params.hf_token, "", params.offline); break; } } - common_params_handle_model(params.speculative.model, params.hf_token, ""); - common_params_handle_model(params.vocoder.model, params.hf_token, ""); + common_params_handle_model(params.speculative.model, params.hf_token, "", params.offline); + common_params_handle_model(params.vocoder.model, params.hf_token, "", params.offline); } if (params.escape) { @@ -1333,9 +1348,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex )); add_opt(common_arg( {"--prio"}, "N", - string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority), + string_format("set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: %d)\n", params.cpuparams.priority), [](common_params & params, int prio) { - if (prio < 0 || prio > 3) { + if (prio < GGML_SCHED_PRIO_LOW || prio > GGML_SCHED_PRIO_REALTIME) { throw std::invalid_argument("invalid value"); } params.cpuparams.priority = (enum ggml_sched_priority) prio; @@ -2233,12 +2248,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD")); add_opt(common_arg( - {"--image"}, "FILE", - "path to an image file. use with multimodal models. Specify multiple times for batching", + {"--image", "--audio"}, "FILE", + "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n", [](common_params & params, const std::string & value) { params.image.emplace_back(value); } - ).set_examples({LLAMA_EXAMPLE_LLAVA})); + ).set_examples({LLAMA_EXAMPLE_MTMD})); if (llama_supports_rpc()) { add_opt(common_arg( {"--rpc"}, "SERVERS", @@ -2848,15 +2863,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA")); add_opt(common_arg( {"--reasoning-format"}, "FORMAT", - "reasoning format (default: deepseek; allowed values: deepseek, none)\n" - "controls whether thought tags are extracted from the response, and in which format they're returned. 
'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n" - "only supported for non-streamed responses", + "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n" + "- none: leaves thoughts unparsed in `message.content`\n" + "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n" + "(default: deepseek)", [](common_params & params, const std::string & value) { /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; } else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; } - else { std::invalid_argument("invalid value"); } + else { throw std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK")); + add_opt(common_arg( + {"--reasoning-budget"}, "N", + "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)", + [](common_params & params, int value) { + if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); } + params.reasoning_budget = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET")); add_opt(common_arg( {"--chat-template"}, "JINJA_TEMPLATE", string_format( @@ -2868,7 +2892,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.chat_template = value; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LLAVA}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); add_opt(common_arg( {"--chat-template-file"}, "JINJA_TEMPLATE_FILE", string_format( @@ -2955,7 +2979,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; } else if (value == "md") { params.batched_bench_output_jsonl = false; } - else { std::invalid_argument("invalid value"); } + else { throw std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_BENCH})); add_opt(common_arg( @@ -2987,6 +3011,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex common_log_set_verbosity_thold(INT_MAX); } )); + add_opt(common_arg( + {"--offline"}, + "Offline mode: forces use of cache, prevents network access", + [](common_params & params) { + params.offline = true; + } + ).set_env("LLAMA_OFFLINE")); add_opt(common_arg( {"-lv", "--verbosity", "--log-verbosity"}, "N", "Set the verbosity threshold. 
Messages with a higher verbosity will be ignored.", diff --git a/common/chat-parser.cpp b/common/chat-parser.cpp new file mode 100644 index 000000000..65b664cb3 --- /dev/null +++ b/common/chat-parser.cpp @@ -0,0 +1,380 @@ +#include "chat-parser.h" +#include "common.h" +#include "log.h" +#include "regex-partial.h" + +#include +#include +#include +#include + +using json = nlohmann::ordered_json; + +common_chat_msg_parser::common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax) + : input_(input), is_partial_(is_partial), syntax_(syntax) +{ + result_.role = "assistant"; + + while (true) { + std::string id = std::to_string(std::rand()); + if (input.find(id) == std::string::npos) { + healing_marker_ = id; + break; + } + } +} + +std::string common_chat_msg_parser::str(const common_string_range & rng) const { + GGML_ASSERT(rng.begin <= rng.end); + return input_.substr(rng.begin, rng.end - rng.begin); +} + +void common_chat_msg_parser::add_content(const std::string &content) { + result_.content += content; +} + +void common_chat_msg_parser::add_reasoning_content(const std::string &reasoning_content) { + result_.reasoning_content += reasoning_content; +} + +bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::string & id, const std::string & arguments) { + if (name.empty()) { + return false; + } + + common_chat_tool_call tool_call; + tool_call.name = name; + tool_call.arguments = arguments; + tool_call.id = id; + + // LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str()); + result_.tool_calls.emplace_back(tool_call); + return true; +} +bool common_chat_msg_parser::add_tool_call(const json & tool_call) { + std::string name = tool_call.contains("name") ? tool_call.at("name") : ""; + std::string id = tool_call.contains("id") ? tool_call.at("id") : ""; + std::string arguments = tool_call.contains("arguments") ? 
tool_call.at("arguments") : ""; + return add_tool_call(name, id, arguments); +} + +bool common_chat_msg_parser::add_tool_calls(const json & arr) { + for (const auto & item : arr) { + if (!add_tool_call(item)) { + return false; + } + } + return true; +} +void common_chat_msg_parser::finish() { + if (!is_partial_ && pos_ != input_.size()) { + throw std::runtime_error("Unexpected content at end of input");// + input_.substr(pos_)); + } +} + +bool common_chat_msg_parser::consume_spaces() { + const auto length = input_.size(); + auto consumed = false; + while (pos_ < length && std::isspace(input_[pos_])) { + ++pos_; + consumed = true; + } + return consumed; +} + +bool common_chat_msg_parser::try_consume_literal(const std::string & literal) { + auto pos = pos_; + for (auto i = 0u; i < literal.size(); ++i) { + if (pos >= input_.size()) { + return false; + } + if (input_[pos] != literal[i]) { + return false; + } + ++pos; + } + pos_ = pos; + return true; +} + +std::optional common_chat_msg_parser::try_find_literal(const std::string & literal) { + auto idx = input_.find(literal, pos_); + if (idx != std::string::npos) { + find_regex_result res; + res.prelude = input_.substr(pos_, idx - pos_); + auto end = idx + literal.size(); + res.groups.emplace_back(common_string_range{idx, end}); + move_to(end); + return res; + } + if (is_partial_) { + idx = string_find_partial_stop(input_, literal); + if (idx != std::string::npos && idx >= pos_) { + find_regex_result res; + res.prelude = input_.substr(pos_, idx - pos_); + auto end = input_.size(); + res.groups.emplace_back(common_string_range{idx, end}); + move_to(end); + return res; + } + } + return std::nullopt; +} + +void common_chat_msg_parser::consume_literal(const std::string & literal) { + if (!try_consume_literal(literal)) { + throw common_chat_msg_partial_exception(literal); + } +} + +bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) { + auto handle_reasoning = [&](const std::string & reasoning, bool closed) { + auto stripped_reasoning = string_strip(reasoning); + if (stripped_reasoning.empty()) { + return; + } + if (syntax_.reasoning_in_content) { + add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "" : start_think); + add_content(stripped_reasoning); + if (closed) { + add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "" : end_think); + } + } else { + add_reasoning_content(stripped_reasoning); + } + }; + if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) { + if (syntax_.thinking_forced_open || try_consume_literal(start_think)) { + if (auto res = try_find_literal(end_think)) { + handle_reasoning(res->prelude, /* closed */ true); + consume_spaces(); + return true; + } + auto rest = consume_rest(); + if (!rest.empty()) { + handle_reasoning(rest, /* closed */ !is_partial()); + } + // Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877) + // if (!syntax_.thinking_forced_open) { + // throw common_chat_msg_partial_exception(end_think); + // } + return true; + } + } + return false; +} + +std::string common_chat_msg_parser::consume_rest() { + auto rest = input_.substr(pos_); + pos_ = input_.size(); + return rest; +} + +// Tries to find the regex, consumes it (pos right after it) and gives the prelude (right before it) and the groups to the callback. 
+std::optional common_chat_msg_parser::try_find_regex(const common_regex & regex, size_t from, bool add_prelude_to_content) { + auto m = regex.search(input_, from == std::string::npos ? pos_ : from); + if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) { + return std::nullopt; + } + auto prelude = input_.substr(pos_, m.groups[0].begin - pos_); + pos_ = m.groups[0].end; + + if (add_prelude_to_content) { + add_content(prelude); + } + if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) { + if (is_partial()) { + throw common_chat_msg_partial_exception(regex.str()); + } + return std::nullopt; + } + return find_regex_result{prelude, m.groups}; +} + +common_chat_msg_parser::find_regex_result common_chat_msg_parser::consume_regex(const common_regex & regex) { + if (auto result = try_consume_regex(regex)) { + return *result; + } + throw common_chat_msg_partial_exception(regex.str()); +} + +std::optional common_chat_msg_parser::try_consume_regex(const common_regex & regex) { + auto m = regex.search(input_, pos_); + if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) { + return std::nullopt; + } + if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) { + if (is_partial()) { + throw common_chat_msg_partial_exception(regex.str()); + } + return std::nullopt; + } + if (m.groups[0].begin != pos_) { + // Didn't match at the current position. + return std::nullopt; + } + pos_ = m.groups[0].end; + + return find_regex_result { + /* .prelude = */ "", + m.groups, + }; +} + +std::optional common_chat_msg_parser::try_consume_json() { + auto it = input_.cbegin() + pos_; + const auto end = input_.cend(); + common_json result; + if (!common_json_parse(it, end, healing_marker_, result)) { + return std::nullopt; + } + pos_ = std::distance(input_.cbegin(), it); + if (result.healing_marker.marker.empty()) { + // No healing marker, just return the parsed json + return result; + } + if (!is_partial()) { + throw common_chat_msg_partial_exception("JSON"); + } + return result; +} + +common_json common_chat_msg_parser::consume_json() { + if (auto result = try_consume_json()) { + return *result; + } + throw common_chat_msg_partial_exception("JSON"); +} + +common_chat_msg_parser::consume_json_result common_chat_msg_parser::consume_json_with_dumped_args( + const std::vector> & args_paths, + const std::vector> & content_paths +) { + if (auto result = try_consume_json_with_dumped_args(args_paths, content_paths)) { + return *result; + } + throw common_chat_msg_partial_exception("JSON"); +} + +std::optional common_chat_msg_parser::try_consume_json_with_dumped_args( + const std::vector> & args_paths, + const std::vector> & content_paths +) { + auto partial = try_consume_json(); + if (!partial) { + return std::nullopt; + } + auto is_arguments_path = [&](const std::vector & path) { + return std::find(args_paths.begin(), args_paths.end(), path) != args_paths.end(); + }; + auto is_content_path = [&](const std::vector & path) { + return std::find(content_paths.begin(), content_paths.end(), path) != content_paths.end(); + }; + + if (partial->healing_marker.marker.empty()) { + if (args_paths.empty()) { + // No arguments to dump, and JSON was parsed fully. + return consume_json_result { + partial->json, + /* .is_partial = */ false, + }; + } + if (is_arguments_path({})) { + // Entire JSON is the arguments and was parsed fully. 
+ return consume_json_result { + partial->json.dump(), + /* .is_partial = */ false, + }; + } + } + + LOG_DBG("Parsed partial JSON: %s (json_healing_marker: %s)\n", partial->json.dump().c_str(), partial->healing_marker.json_dump_marker.c_str()); + + auto found_healing_marker = false; + std::vector path; + std::function remove_unsupported_healings_and_dump_args = [&](const json & j) -> json { + if (is_arguments_path(path)) { + auto arguments = j.dump(); + if (is_partial() && !partial->healing_marker.marker.empty()) { + auto idx = arguments.find(partial->healing_marker.json_dump_marker); + if (idx != std::string::npos) { + arguments.resize(idx); + found_healing_marker = true; + } + if (arguments == "\"") { + // This happens because of completing `:"$magic` after `"arguments"` + arguments = ""; + } + } + return arguments; + } + if (is_content_path(path)) { + if (!j.is_string()) { + throw std::runtime_error("Content path must be a string"); + } + std::string str = j; + auto idx = str.find(partial->healing_marker.marker); // not using json_dump_marker as we're inside a string + if (idx != std::string::npos) { + str.resize(idx); + found_healing_marker = true; + } + return str; + } + if (j.is_object()) { + auto obj = json::object(); + for (const auto & p : j.items()) { + const auto & key = p.key(); + const auto & value = p.value(); + const std::string key_str = key; // NOLINT + auto idx = key_str.find(healing_marker_); + if (idx != std::string::npos) { + found_healing_marker = true; + break; + } + path.push_back(key_str); + if (value.is_string()) { + const std::string value_str = value; + if (value_str.find(healing_marker_) != std::string::npos) { + found_healing_marker = true; + if (is_content_path(path)) { + if (partial->healing_marker.marker == partial->healing_marker.json_dump_marker) { + // The healing occurred inside the string: good. Otherwise we just ditch the entire key/value pair. + obj[key] = remove_unsupported_healings_and_dump_args(value); + } + } + break; + } + obj[key] = value; + } else { + obj[key] = remove_unsupported_healings_and_dump_args(value); + } + path.pop_back(); + } + return obj; + } + if (j.is_array()) { + auto arr = json::array(); + for (const auto & value : j) { + if (value.is_string()) { + std::string str = value; + auto idx = str.find(healing_marker_); + if (idx != std::string::npos) { + // Don't heal array values that aren't in the arguments. 
+ found_healing_marker = true; + break; + } + } + arr.push_back(remove_unsupported_healings_and_dump_args(value)); + } + return arr; + } + return j; + }; + + auto cleaned = remove_unsupported_healings_and_dump_args(partial->json); + LOG_DBG("Cleaned up JSON %s to %s (json_healing_marker : '%s')\n", partial->json.dump().c_str(), cleaned.dump().c_str(), partial->healing_marker.json_dump_marker.c_str()); + return consume_json_result { + cleaned, + /* .is_partial = */ found_healing_marker, + }; +} diff --git a/common/chat-parser.h b/common/chat-parser.h new file mode 100644 index 000000000..7ee355056 --- /dev/null +++ b/common/chat-parser.h @@ -0,0 +1,118 @@ +#pragma once + +#include "chat.h" +#include "json-partial.h" +#include "regex-partial.h" + +#include + +#include +#include +#include + +class common_chat_msg_partial_exception : public std::runtime_error { + public: + common_chat_msg_partial_exception(const std::string & message) : std::runtime_error(message) {} +}; + +class common_chat_msg_parser { + std::string input_; + bool is_partial_; + common_chat_syntax syntax_; + std::string healing_marker_; + + size_t pos_ = 0; + common_chat_msg result_; + + public: + common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax); + const std::string & input() const { return input_; } + size_t pos() const { return pos_; } + const std::string & healing_marker() const { return healing_marker_; } + const bool & is_partial() const { return is_partial_; } + const common_chat_msg & result() const { return result_; } + const common_chat_syntax & syntax() const { return syntax_; } + + void move_to(size_t pos) { + if (pos > input_.size()) { + throw std::runtime_error("Invalid position!"); + } + pos_ = pos; + } + void move_back(size_t n) { + if (pos_ < n) { + throw std::runtime_error("Can't move back that far!"); + } + pos_ -= n; + } + + // Get the substring of the input at the given range + std::string str(const common_string_range & rng) const; + + // Appends to the result.content field + void add_content(const std::string & content); + + // Appends to the result.reasoning_content field + void add_reasoning_content(const std::string & reasoning_content); + + // Adds a tool call to the result. If the tool call is too incomplete (e.g. name empty), it won't add anything. + bool add_tool_call(const std::string & name, const std::string & id, const std::string & arguments); + + // Adds a tool call using the "name", "id" and "arguments" fields of the json object + bool add_tool_call(const nlohmann::ordered_json & tool_call); + + // Adds an array of tool calls using their "name", "id" and "arguments" fields. 
+ bool add_tool_calls(const nlohmann::ordered_json & arr); + + void finish(); + + bool consume_spaces(); + + void consume_literal(const std::string & literal); + + bool try_parse_reasoning(const std::string & start_think, const std::string & end_think); + + std::string consume_rest(); + + struct find_regex_result { + std::string prelude; + std::vector groups; + }; + + std::optional try_find_regex(const common_regex & regex, size_t from = std::string::npos, bool add_prelude_to_content = true); + + bool try_consume_literal(const std::string & literal); + + std::optional try_find_literal(const std::string & literal); + + find_regex_result consume_regex(const common_regex & regex); + + std::optional try_consume_regex(const common_regex & regex); + + std::optional try_consume_json(); + common_json consume_json(); + + struct consume_json_result { + nlohmann::ordered_json value; + bool is_partial; + }; + + /* + Consume (possibly partial) json and converts specific subtrees to (possibly truncated) JSON strings. + + By default, object keys can't be truncated, nor can string values (their corresponding key is removed, + e.g. `{"foo": "bar", "baz": "b` -> `{"foo": "bar"}` + + But one can allow subpaths to be kept truncated, and possibly json-dumped to truncated json strings + - with `content_paths={{"foo"}}` -> `{"foo": "b` -> {"foo": "b"}` + - with `args_paths={{"foo"}}` -> `{"foo": {"b` -> `{"foo": "{b"}` + */ + consume_json_result consume_json_with_dumped_args( + const std::vector> & args_paths = {}, + const std::vector> & content_paths = {} + ); + std::optional try_consume_json_with_dumped_args( + const std::vector> & args_paths = {}, + const std::vector> & content_paths = {} + ); +}; diff --git a/common/chat.cpp b/common/chat.cpp index f138c7bca..f1ab4c85a 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1,10 +1,21 @@ #include "chat.h" +#include "chat-parser.h" +#include "common.h" +#include "json-partial.h" #include "json-schema-to-grammar.h" #include "log.h" -#include "minja/chat-template.hpp" -#include "minja/minja.hpp" +#include "regex-partial.h" +#include +#include + +#include +#include +#include #include +#include +#include +#include static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) { auto time = std::chrono::system_clock::to_time_t(now); @@ -15,6 +26,101 @@ static std::string format_time(const std::chrono::system_clock::time_point & now return res; } +static std::string string_diff(const std::string & last, const std::string & current) { + if (last.empty()) { + return current; + } + if (!string_starts_with(current, last)) { + if (string_starts_with(last, current)) { + // This happens if the last generation ended on a partial stop word (not erased), + // and the current ended on a stop word (erased). 
+ return ""; + } + throw std::runtime_error("Invalid diff: '" + last + "' not found at start of '" + current + "'"); + } + return current.substr(last.size()); +} + +static bool has_content_or_tool_calls(const common_chat_msg & msg) { + return !msg.content.empty() || !msg.tool_calls.empty(); +} + +template <> +json common_chat_msg::to_json_oaicompat() const +{ + json message { + {"role", "assistant"}, + }; + if (!reasoning_content.empty()) { + message["reasoning_content"] = reasoning_content; + } + if (content.empty() && !tool_calls.empty()) { + message["content"] = json(); + } else { + message["content"] = content; + } + if (!tool_calls.empty()) { + auto arr = json::array(); + for (const auto & tc : tool_calls) { + arr.push_back({ + {"type", "function"}, + {"function", { + {"name", tc.name}, + {"arguments", tc.arguments}, + }}, + {"id", tc.id}, + // // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo). + // // We only generate a random id for the ones that don't generate one by themselves + // // (they also won't get to see it as their template likely doesn't use it, so it's all for the client) + // {"id", tc.id.empty() ? gen_tool_call_id() : tc.id}, + }); + } + message["tool_calls"] = arr; + } + return message; +} + +std::vector common_chat_msg_diff::compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg) { + std::vector diffs; + // if (previous_msg.reasoning_content != current.reasoning_content) { + // auto & diff = diffs.emplace_back(); + // diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, current.reasoning_content); + // } + if (previous_msg.content != new_msg.content) { + auto & diff = diffs.emplace_back(); + diff.content_delta = string_diff(previous_msg.content, new_msg.content); + } + + if (new_msg.tool_calls.size() < previous_msg.tool_calls.size()) { + throw std::runtime_error("Invalid diff: now finding less tool calls!"); + } + + if (!previous_msg.tool_calls.empty()) { + auto idx = previous_msg.tool_calls.size() - 1; + const auto & pref = previous_msg.tool_calls[idx]; + const auto & newf = new_msg.tool_calls[idx]; + if (pref.name != newf.name) { + throw std::runtime_error("Invalid diff: tool call mismatch!"); + } + auto args_diff = string_diff(pref.arguments, newf.arguments); + if (!args_diff.empty() || pref.id != newf.id) { + auto & diff = diffs.emplace_back(); + diff.tool_call_index = idx; + if (pref.id != newf.id) { + diff.tool_call_delta.id = newf.id; + diff.tool_call_delta.name = newf.name; + } + diff.tool_call_delta.arguments = args_diff; + } + } + for (size_t idx = previous_msg.tool_calls.size(); idx < new_msg.tool_calls.size(); ++idx) { + auto & diff = diffs.emplace_back(); + diff.tool_call_index = idx; + diff.tool_call_delta = new_msg.tool_calls[idx]; + } + return diffs; +} + typedef minja::chat_template common_chat_template; struct common_chat_templates { @@ -32,7 +138,7 @@ struct templates_params { bool stream; std::string grammar; bool add_generation_prompt = true; - bool extract_reasoning = true; + bool enable_thinking = true; std::chrono::system_clock::time_point now = std::chrono::system_clock::now(); }; @@ -277,6 +383,32 @@ json common_chat_tools_to_json_oaicompat(const std::vector & t return result; } +template <> json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) { + json delta = json::object(); + // if (!diff.reasoning_content_delta.empty()) { + // delta["reasoning_content"] = msg.reasoning_content; + // } + if 
(!diff.content_delta.empty()) { + delta["content"] = diff.content_delta; + } + if (diff.tool_call_index != std::string::npos) { + json tool_call; + tool_call["index"] = diff.tool_call_index; + if (!diff.tool_call_delta.id.empty()) { + tool_call["id"] = diff.tool_call_delta.id; + tool_call["type"] = "function"; + } + json function = json::object(); + if (!diff.tool_call_delta.name.empty()) { + function["name"] = diff.tool_call_delta.name; + } + function["arguments"] = diff.tool_call_delta.arguments; + tool_call["function"] = function; + delta["tool_calls"] = json::array({tool_call}); + } + return delta; +} + bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) { if (use_jinja) { try { @@ -444,7 +576,7 @@ common_chat_templates_ptr common_chat_templates_init( return tmpls; } -std::string common_chat_format_name(common_chat_format format) { +const char * common_chat_format_name(common_chat_format format) { switch (format) { case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only"; case COMMON_CHAT_FORMAT_GENERIC: return "Generic"; @@ -452,182 +584,127 @@ std::string common_chat_format_name(common_chat_format format) { case COMMON_CHAT_FORMAT_LLAMA_3_X: return "Llama 3.x"; case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: return "Llama 3.x with builtin tools"; case COMMON_CHAT_FORMAT_DEEPSEEK_R1: return "DeepSeek R1"; - case COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING: return "DeepSeek R1 (extract reasoning)"; case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2"; case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2"; case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1"; case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro"; - case COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING: return "Hermes 2 Pro (extract reasoning)"; case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B"; - case COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING: return "Command R7B (extract reasoning)"; default: throw std::runtime_error("Unknown chat format"); } } -static bool parse_json(std::string::const_iterator & it, const std::string::const_iterator & end, json & out) { - // // https://json.nlohmann.me/features/parsing/sax_interface/ - struct json_error_locator : public nlohmann::json_sax { - std::size_t position; - bool found_error; +const char * common_reasoning_format_name(common_reasoning_format format) { + switch (format) { + case COMMON_REASONING_FORMAT_NONE: return "none"; + case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek"; + default: + throw std::runtime_error("Unknown reasoning format"); + } +} - json_error_locator() : position(0), found_error(false) {} - - bool parse_error(std::size_t position, const std::string &, const json::exception &) override { // NOLINT - this->position = position - 1; - this->found_error = true; - return false; +static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) { + std::string arguments; + if (builder.is_partial()) { + arguments = (json {{"code", code + builder.healing_marker()}}).dump(); + auto idx = arguments.find(builder.healing_marker()); + if (idx != std::string::npos) { + arguments.resize(idx); } - bool null() override { return true; } // NOLINT - bool boolean(bool) override { return true; } // NOLINT - bool number_integer(number_integer_t) override { return true; } // NOLINT - bool number_unsigned(number_unsigned_t) override { return true; } // NOLINT - bool number_float(number_float_t, const string_t &) override { 
return true; } // NOLINT - bool string(string_t &) override { return true; } // NOLINT - bool binary(binary_t &) override { return true; } // NOLINT - bool start_object(std::size_t) override { return true; } // NOLINT - bool key(string_t &) override { return true; } // NOLINT - bool end_object() override { return true; } - bool start_array(std::size_t) override { return true; } // NOLINT - bool end_array() override { return true; } - }; - json_error_locator err_loc; - json::sax_parse(it, end, &err_loc); - - std::string::const_iterator temptative_end; - if (err_loc.found_error) { - temptative_end = it + err_loc.position; } else { - temptative_end = end; - } - std::string json_sub {it, temptative_end}; - try { - out = json::parse(json_sub); - it = temptative_end; - return true; - } catch (const std::exception &) { - return false; - } -} - -static bool parse_literal(std::string::const_iterator & it, const std::string::const_iterator & end, const std::string & expected) { - auto expected_it = expected.begin(); - auto tmp_it = it; - while (tmp_it != end && expected_it != expected.end() && *tmp_it == *expected_it) { - ++tmp_it; - ++expected_it; - } - if (expected_it == expected.end()) { - it = tmp_it; - return true; - } - return false; -} - -static std::optional parse_pattern(std::string::const_iterator & it, const std::string::const_iterator & end, const std::regex & expected) { - std::smatch match; - if (std::regex_match(it, end, match, expected)) { - it = match.suffix().first; - return match; - } - return std::nullopt; -} - -static void consume_spaces(std::string::const_iterator & it, const std::string::const_iterator & end) { - while (it != end && std::isspace(*it)) { - ++it; + arguments = (json {{"code", code}}).dump(); } + return arguments; } /** * Takes a prefix regex that must have 1 group to capture the function name, a closing suffix, and expects json parameters in between. * Aggregates the prefix, suffix and in-between text into the content. */ -static common_chat_msg parse_json_tool_calls( - const std::string& input, - const std::optional & trigger_opt, - const std::regex & function_regex, - const std::regex & close_regex, - bool allow_raw_python = false) { - std::smatch match; +static void parse_json_tool_calls( + common_chat_msg_parser & builder, + const std::optional & block_open, + const std::optional & function_regex_start_only, + const std::optional & function_regex, + const common_regex & close_regex, + const std::optional & block_close, + bool allow_raw_python = false, + const std::function & get_function_name = nullptr) { - common_chat_msg result; - result.role = "assistant"; + auto parse_tool_calls = [&]() { + size_t from = std::string::npos; + auto first = true; + while (true) { + auto res = function_regex_start_only && first + ? builder.try_consume_regex(*function_regex_start_only) + : function_regex + ? builder.try_find_regex(*function_regex, from) + : std::nullopt; + if (res) { + std::string name; + if (get_function_name) { + name = get_function_name(*res); + } else { + GGML_ASSERT(res->groups.size() == 2); + name = builder.str(res->groups[1]); + } + first = false; + if (name.empty()) { + // get_function_name signalled us that we should skip this match and treat it as content. 
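+ // (Annotation, not from the patch: the Functionary v3.2 parser below returns an empty name for the
+ // leading "all\n" channel, whose text is plain content; bumping `from` one past the match start
+ // prevents re-matching the same position on the next iteration.)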
+ from = res->groups[0].begin + 1; + continue; + } + from = std::string::npos; - - auto end = input.end(); - auto it = input.begin(); - - if (trigger_opt) { - if (!std::regex_search(it, end, match, *trigger_opt)) { - result.content = input; - return result; - } - result.content = match.prefix().str(); - it = match.suffix().first; - } - - while (it != end) { - std::sregex_iterator rend; - std::sregex_iterator rit(it, end, function_regex); - if (rit == rend) { - result.content += std::string(it, end); + auto maybe_raw_python = name == "python" && allow_raw_python; + if (builder.input()[builder.pos()] == '{' || !maybe_raw_python) { + if (auto arguments = builder.try_consume_json_with_dumped_args({{}})) { + if (!builder.add_tool_call(name, "", arguments->value) || arguments->is_partial) { + throw common_chat_msg_partial_exception("incomplete tool call"); + } + builder.consume_regex(close_regex); + } + continue; + } + if (maybe_raw_python) { + auto arguments = wrap_code_as_arguments(builder, builder.consume_rest()); + if (!builder.add_tool_call(name, "", arguments)) { + throw common_chat_msg_partial_exception("incomplete tool call"); + } + return; + } + throw common_chat_msg_partial_exception("incomplete tool call"); + } break; } - auto name = rit->str(1); - result.content += std::string(it, rit->prefix().second); - it = rit->suffix().first; - - json arguments; - if (parse_json(it, end, arguments)) { - if (!std::regex_search(it, end, match, close_regex)) { - throw std::runtime_error("Malformed input, missing closing pattern: " + input); - } - it = match.suffix().first; - result.tool_calls.push_back({name, arguments.is_string() ? arguments.get() : arguments.dump(), /* id= */ ""}); - } else { - if (allow_raw_python && name == "python") { - result.tool_calls.push_back({name, json({{"code", std::string(it, end)}}).dump(), /* id= */ ""}); - break; - } - throw std::runtime_error("Failed to parse json tool call arguments: " + input); + if (block_close) { + builder.consume_regex(*block_close); } - } - - if (!result.tool_calls.empty()) { - if (!string_strip(result.content).empty()) { - LOG_WRN("Content found with tool calls: %s\n", result.content.c_str()); - } - result.content = ""; - } - return result; -} - -static common_chat_tool_call process_tool_call(const json & tool_call) { - const auto & arguments = tool_call.at("arguments"); - return { - /* .name = */ tool_call.at("name"), - /* .arguments = */ arguments.is_string() ? arguments.get() : arguments.dump(), - /* .id = */ tool_call.contains("id") ? 
tool_call.at("id") : "", + builder.consume_spaces(); + builder.add_content(builder.consume_rest()); }; -} -static common_chat_msg parse_prefixed_json_tool_call_array(const std::string& input, const std::string & prefix, size_t rstrip_prefix = 0) { - auto content_end = input.find(prefix); - size_t tc_start = std::string::npos; - - common_chat_msg result; - result.role = "assistant"; - if (content_end == std::string::npos) { - result.content = input; - } else { - tc_start = content_end + prefix.size() - rstrip_prefix; - result.content = input.substr(0, content_end); - auto tool_calls = json::parse(input.substr(tc_start)); - for (const auto & tool_call : tool_calls) { - result.tool_calls.emplace_back(process_tool_call(tool_call)); + if (block_open) { + if (auto res = builder.try_find_regex(*block_open)) { + parse_tool_calls(); + } else { + builder.add_content(builder.consume_rest()); } + } else { + parse_tool_calls(); + } +} + +static void parse_prefixed_json_tool_call_array(common_chat_msg_parser & builder, const common_regex & prefix, size_t rstrip_prefix = 0) { + static const std::vector> args_paths = {{"arguments"}}; + if (auto res = builder.try_find_regex(prefix)) { + builder.move_back(rstrip_prefix); + auto tool_calls = builder.consume_json_with_dumped_args(args_paths); + if (!builder.add_tool_calls(tool_calls.value) || tool_calls.is_partial) { + throw common_chat_msg_partial_exception("incomplete tool call array"); + } + } else { + builder.add_content(builder.consume_rest()); } - return result; } static void foreach_function(const json & tools, const std::function & fn) { @@ -754,29 +831,36 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp data.format = COMMON_CHAT_FORMAT_GENERIC; return data; } -static common_chat_msg common_chat_parse_generic(const std::string & input) { - json data = json::parse(input); - common_chat_msg result; - result.role = "assistant"; - if (data.contains("tool_calls")) { - for (const auto & tool_call : data.at("tool_calls")) { - result.tool_calls.push_back({ - tool_call.at("name"), - tool_call.at("arguments").dump(), - tool_call.contains("id") ? tool_call.at("id") : "", - }); - } - } else if (data.contains("tool_call")) { - result.tool_calls.push_back({ - data.at("tool_call").at("name"), - data.at("tool_call").at("arguments").dump(), - /* id= */ "", - }); - } else if (data.contains("response")) { - const auto & response = data.at("response"); - result.content = response.is_string() ? response.get() : response.dump(2); +static void common_chat_parse_generic(common_chat_msg_parser & builder) { + if (!builder.syntax().parse_tool_calls) { + builder.add_content(builder.consume_rest()); + return; + } + static const std::vector> content_paths = { + {"response"}, + }; + static const std::vector> args_paths = { + {"tool_call", "arguments"}, + {"tool_calls", "arguments"}, + }; + auto data = builder.consume_json_with_dumped_args(args_paths, content_paths); + if (data.value.contains("tool_calls")) { + if (!builder.add_tool_calls(data.value.at("tool_calls")) || data.is_partial) { + throw common_chat_msg_partial_exception("incomplete tool calls"); + } + } else if (data.value.contains("tool_call")) { + if (!builder.add_tool_call(data.value.at("tool_call")) || data.is_partial) { + throw common_chat_msg_partial_exception("incomplete tool call"); + } + } else if (data.value.contains("response")) { + const auto & response = data.value.at("response"); + builder.add_content(response.is_string() ? 
response.template get() : response.dump(2)); + if (data.is_partial) { + throw common_chat_msg_partial_exception("incomplete response"); + } + } else { + throw common_chat_msg_partial_exception("Expected 'tool_call', 'tool_calls' or 'response' in JSON"); } - return result; } static common_chat_params common_chat_params_init_mistral_nemo(const common_chat_template & tmpl, const struct templates_params & inputs) { @@ -823,12 +907,44 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO; return data; } -static common_chat_msg common_chat_parse_mistral_nemo(const std::string & input) { - return parse_prefixed_json_tool_call_array(input, "[TOOL_CALLS]"); +static void common_chat_parse_mistral_nemo(common_chat_msg_parser & builder) { + if (!builder.syntax().parse_tool_calls) { + builder.add_content(builder.consume_rest()); + return; + } + + static const common_regex prefix(regex_escape("[TOOL_CALLS]")); + parse_prefixed_json_tool_call_array(builder, prefix); } static common_chat_params common_chat_params_init_command_r7b(const common_chat_template & tmpl, const struct templates_params & inputs) { common_chat_params data; + + auto adjusted_messages = json::array(); + for (const auto & msg : inputs.messages) { + auto has_reasoning_content = msg.contains("reasoning_content") && msg.at("reasoning_content").is_string(); + auto has_tool_calls = msg.contains("tool_calls") && msg.at("tool_calls").is_array(); + if (has_reasoning_content && has_tool_calls) { + auto adjusted_message = msg; + adjusted_message["tool_plan"] = msg.at("reasoning_content"); + adjusted_message.erase("reasoning_content"); + adjusted_messages.push_back(adjusted_message); + } else { + adjusted_messages.push_back(msg); + } + } + data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {}); + data.format = COMMON_CHAT_FORMAT_COMMAND_R7B; + if (string_ends_with(data.prompt, "<|START_THINKING|>")) { + if (!inputs.enable_thinking) { + data.prompt += "<|END_THINKING|>"; + } else { + data.thinking_forced_open = true; + } + } else if (!inputs.enable_thinking && string_ends_with(data.prompt, "<|CHATBOT_TOKEN|>")) { + data.prompt += "<|START_THINKING|><|END_THINKING|>"; + } + data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; data.grammar = build_grammar([&](const common_grammar_builder & builder) { auto schemas = json::array(); @@ -859,11 +975,16 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_ if (!inputs.parallel_tool_calls) { schema["maxItems"] = 1; } - builder.add_rule("root", "\"<|START_ACTION|>\" " + builder.add_schema("tool_calls", schema) + " \"<|END_ACTION|>\""); + builder.add_rule("root", + std::string(data.thinking_forced_open ? "( \"<|END_THINKING|>\" space )? " : "") + + "\"<|START_ACTION|>\" " + builder.add_schema("tool_calls", schema) + " \"<|END_ACTION|>\""); }); data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_WORD, - "<|START_ACTION|>", + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL, + // If thinking_forced_open, then we capture the tag in the grammar, + // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar) + std::string(data.thinking_forced_open ? 
"[\\s\\S]*?(<\\|END_THINKING\\|>\\s*)" : "(?:<\\|START_THINKING\\|>[\\s\\S]*?<\\|END_THINKING\\|>\\s*)?") + + "(<\\|START_ACTION\\|>)[\\s\\S]*" }); data.preserved_tokens = { "<|START_ACTION|>", @@ -873,61 +994,40 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_ "<|START_THINKING|>", "<|END_THINKING|>", }; - auto adjusted_messages = json::array(); - for (const auto & msg : inputs.messages) { - auto has_reasoning_content = msg.contains("reasoning_content") && msg.at("reasoning_content").is_string(); - auto has_tool_calls = msg.contains("tool_calls") && msg.at("tool_calls").is_array(); - if (has_reasoning_content && has_tool_calls) { - auto adjusted_message = msg; - adjusted_message["tool_plan"] = msg.at("reasoning_content"); - adjusted_message.erase("reasoning_content"); - adjusted_messages.push_back(adjusted_message); - } else { - adjusted_messages.push_back(msg); - } - } - data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {}); - data.format = inputs.extract_reasoning ? COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING : COMMON_CHAT_FORMAT_COMMAND_R7B; return data; } -static common_chat_msg common_chat_parse_command_r7b(const std::string & input, bool extract_reasoning) { - static const std::regex thought_regex("(<\\|START_THINKING\\|>([\\s\\S]*?)<\\|END_THINKING\\|>)([\\s\\S]*)"); - static const std::regex action_regex("<\\|START_ACTION\\|>([\\s\\S]*?)<\\|END_ACTION\\|>"); - static const std::regex response_regex("(?:<\\|START_RESPONSE\\|>)?([\\s\\S]*?)<\\|END_RESPONSE\\|>"); - std::smatch match; +static void common_chat_parse_command_r7b(common_chat_msg_parser & builder) { + builder.try_parse_reasoning("<|START_THINKING|>", "<|END_THINKING|>"); - common_chat_msg result; - result.role = "assistant"; + static const common_regex start_action_regex("<\\|START_ACTION\\|>"); + static const common_regex end_action_regex("<\\|END_ACTION\\|>"); + static const common_regex start_response_regex("<\\|START_RESPONSE\\|>"); + static const common_regex end_response_regex("<\\|END_RESPONSE\\|>"); - std::string rest = input; - - if (std::regex_match(rest, match, thought_regex)) { - if (extract_reasoning) { - result.reasoning_content = match[2].str(); - } else if (!match[2].str().empty()) { - // Let the unparsed thinking tags through in content only if their insides aren't empty. - result.content = match[1].str(); + if (auto res = builder.try_find_regex(start_action_regex)) { + // If we didn't extract thoughts, prelude includes them. + auto tool_calls = builder.consume_json_with_dumped_args({{"parameters"}}); + for (const auto & tool_call : tool_calls.value) { + std::string name = tool_call.contains("tool_name") ? tool_call.at("tool_name") : ""; + std::string id = tool_call.contains("tool_call_id") ? tool_call.at("tool_call_id") : ""; + std::string arguments = tool_call.contains("parameters") ? 
tool_call.at("parameters") : ""; + if (!builder.add_tool_call(name, id, arguments) || tool_calls.is_partial) { + throw common_chat_msg_partial_exception("incomplete tool call"); + } } - rest = match[3].str(); - } - if (std::regex_match(rest, match, action_regex)) { - auto actions_str = match[1].str(); - auto actions = json::parse(actions_str); - for (const auto & action : actions) { - result.tool_calls.push_back({ - /* .name = */ action.at("tool_name"), - /* .arguments = */ action.at("parameters").dump(), - /* .id = */ action.at("tool_call_id"), - }); + if (tool_calls.is_partial) { + throw common_chat_msg_partial_exception("incomplete tool call"); + } + builder.consume_regex(end_action_regex); + } else if (auto res = builder.try_find_regex(start_response_regex)) { + if (!builder.try_find_regex(end_response_regex)) { + builder.add_content(builder.consume_rest()); + throw common_chat_msg_partial_exception(end_response_regex.str()); } - } else if (std::regex_match(rest, match, response_regex)) { - auto response = match[1].str(); - result.content += response; } else { - result.content += rest; + builder.add_content(builder.consume_rest()); } - return result; } static void expect_tool_parameters(const std::string & name, const json & parameters, const std::vector & expected_properties) { @@ -1004,8 +1104,8 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te }); // Small models may hallucinate function names so we match anything (*at the start*) that looks like the JSON of a function call, regardless of the name. data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START, - "\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"", // + name + "\"[\\s\\S]*", + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL, + "(\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\")[\\s\\S]*", // + name + "\"[\\s\\S]*", }); if (!builtin_tools.empty()) { data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"}); @@ -1028,78 +1128,64 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te }); return data; } -static common_chat_msg common_chat_parse_llama_3_1(const std::string & input, bool with_builtin_tools = false) { - // TODO: tighten & simplify the parser, don't accept leading text context. 
- static const std::regex function_regex( +static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) { + if (!builder.syntax().parse_tool_calls) { + builder.add_content(builder.consume_rest()); + return; + } + + static const common_regex function_regex( "\\s*\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"([^\"]+)\"\\s*,\\s*\"parameters\"\\s*: "); - static const std::regex close_regex("\\}\\s*"); - static const std::regex builtin_call_regex("<\\|python_tag\\|>\\s*([^.(]+)\\s*\\.\\s*call\\s*\\(\\s*([\\w]+)\\s*=\\s*([\\s\\S]*?)\\)"); + static const common_regex close_regex("\\}\\s*"); + + static const common_regex function_name_regex("\\s*(\\w+)\\s*\\.\\s*call\\("); + static const common_regex arg_name_regex("\\s*(\\w+)\\s*=\\s*"); if (with_builtin_tools) { - std::smatch match; - if (std::regex_match(input, match, builtin_call_regex)) { - try { - auto name = match[1].str(); - auto arg_name = match[2].str(); - auto arg_value_str = match[3].str(); - auto arg_value = json::parse(arg_value_str); + static const common_regex builtin_call_regex("<\\|python_tag\\|>"); + if (auto res = builder.try_find_regex(builtin_call_regex)) { + auto fun_res = builder.consume_regex(function_name_regex); + auto function_name = builder.str(fun_res.groups[1]); - common_chat_msg msg; - msg.role = "assistant"; - msg.tool_calls.push_back({ - /* .name = */ name, - /* .arguments = */ (json { - {arg_name, arg_value}, - }).dump(), - /* .id = */ "", - }); - return msg; - } catch (const std::exception & e) { - LOG_WRN("Failed to parse builtin tool call arguments (%s): %s", e.what(), input.c_str()); + common_healing_marker healing_marker; + json args = json::object(); + while (true) { + if (auto arg_res = builder.try_consume_regex(arg_name_regex)) { + auto arg_name = builder.str(arg_res->groups[1]); + auto partial = builder.consume_json(); + args[arg_name] = partial.json; + healing_marker.marker = partial.healing_marker.marker; + healing_marker.json_dump_marker = partial.healing_marker.json_dump_marker; + builder.consume_spaces(); + if (!builder.try_consume_literal(",")) { + break; + } + } else { + break; + } } + builder.consume_literal(")"); + builder.consume_spaces(); + + auto arguments = args.dump(); + if (!builder.add_tool_call(function_name, "", arguments)) { + throw common_chat_msg_partial_exception("Incomplete tool call"); + } + return; } } - return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex); + parse_json_tool_calls( + builder, + /* block_open= */ std::nullopt, + /* function_regex_start_only= */ function_regex, + /* function_regex= */ std::nullopt, + close_regex, + std::nullopt); + } static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct templates_params & inputs) { common_chat_params data; - if (inputs.tools.is_array() && !inputs.tools.empty()) { - data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null(); - data.grammar = build_grammar([&](const common_grammar_builder & builder) { - std::vector tool_rules; - foreach_function(inputs.tools, [&](const json & tool) { - const auto & function = tool.at("function"); - std::string name = function.at("name"); - auto parameters = function.at("parameters"); - builder.resolve_refs(parameters); - tool_rules.push_back(builder.add_rule(name + "-call", - "\"<|tool▁call▁begin|>function<|tool▁sep|>" + name + "\\n" - "```json\\n\" " + builder.add_schema(name + "-args", parameters) + " " 
- "\"```<|tool▁call▁end|>\"")); - }); - // Distill Qwen 7B & 32B models seem confused re/ syntax of their tool call opening tag, - // so we accept common variants (then it's all constrained) - builder.add_rule("root", - "( \"<|tool▁calls▁begin|>\" | \"<|tool_calls_begin|>\" | \"<|tool calls begin|>\" | \"<|tool\\\\_calls\\\\_begin|>\" ) " - "(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " " - "\"<|tool▁calls▁end|>\"" - " space"); - data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool▁calls▁begin|>"}); - data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool_calls_begin|>"}); - data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool calls begin|>"}); - data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool\\_calls\\_begin|>"}); - data.preserved_tokens = { - "", - "", - "<|tool▁calls▁begin|>", - "<|tool▁call▁begin|>", - "<|tool▁sep|>", - "<|tool▁call▁end|>", - "<|tool▁calls▁end|", - }; - }); - } auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); // Hacks to fix the official (broken) prompt. @@ -1120,45 +1206,76 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_ "$1<|tool▁calls▁end|><|end▁of▁sentence|>$2"); } data.prompt = prompt; - data.format = inputs.extract_reasoning ? COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING : COMMON_CHAT_FORMAT_DEEPSEEK_R1; + data.format = COMMON_CHAT_FORMAT_DEEPSEEK_R1; + if (string_ends_with(data.prompt, "\n")) { + if (!inputs.enable_thinking) { + data.prompt += ""; + } else { + data.thinking_forced_open = true; + } + } + + if (inputs.tools.is_array() && !inputs.tools.empty()) { + data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null(); + data.grammar = build_grammar([&](const common_grammar_builder & builder) { + std::vector tool_rules; + foreach_function(inputs.tools, [&](const json & tool) { + const auto & function = tool.at("function"); + std::string name = function.at("name"); + auto parameters = function.at("parameters"); + builder.resolve_refs(parameters); + tool_rules.push_back(builder.add_rule(name + "-call", + "( \"<|tool▁call▁begin|>\" )? \"function<|tool▁sep|>" + name + "\\n" + "```json\\n\" " + builder.add_schema(name + "-args", parameters) + " " + "\"```<|tool▁call▁end|>\"")); + }); + // Distill Qwen 7B & 32B models seem confused re/ syntax of their tool call opening tag, + // so we accept common variants (then it's all constrained) + builder.add_rule("root", + std::string(data.thinking_forced_open ? "( \"\" space )? " : "") + + "( \"<|tool▁calls▁begin|>\" | \"<|tool_calls_begin|>\" | \"<|tool calls begin|>\" | \"<|tool\\\\_calls\\\\_begin|>\" | \"<|tool▁calls|>\" ) " + "(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " " + "\"<|tool▁calls▁end|>\"" + " space"); + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL, + // If thinking_forced_open, then we capture the tag in the grammar, + // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar) + std::string(data.thinking_forced_open ? 
"[\\s\\S]*?(\\s*)" : "(?:[\\s\\S]*?\\s*)?") + + "(<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)[\\s\\S]*" + }); + data.preserved_tokens = { + "", + "", + "<|tool▁calls▁begin|>", + "<|tool▁call▁begin|>", + "<|tool▁sep|>", + "<|tool▁call▁end|>", + "<|tool▁calls▁end|", + }; + }); + } return data; } -static common_chat_msg handle_think_tag_prelude(const std::string & input, bool extract_reasoning, const std::function & rest_parser) { - std::smatch match; - static const std::regex reasoning_content_regex("((?:)?([\\s\\S\\r\\n]*?))?([\\s\\S\\r\\n]*)"); - if (std::regex_match(input, match, reasoning_content_regex)) { - auto rest = match[3].str(); - auto msg = rest_parser(rest); - auto reasoning_content = string_strip(match[2].str()); - if (extract_reasoning) { - msg.reasoning_content = reasoning_content; - } else if (!reasoning_content.empty()) { - std::ostringstream content; - content << "" << reasoning_content << "" << msg.content; - msg.content = content.str(); - } - return msg; +static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) { + builder.try_parse_reasoning("", ""); + if (!builder.syntax().parse_tool_calls) { + builder.add_content(builder.consume_rest()); + return; } - return rest_parser(input); -} -static common_chat_msg common_chat_parse_deepseek_r1(const std::string & input, bool extract_reasoning) { - return handle_think_tag_prelude(input, extract_reasoning, [](const std::string & input) { - static const std::regex function_regex("<|tool▁call▁begin|>function<|tool▁sep|>([^\n]+)\n```json\n"); - static const std::regex close_regex("```[\\s\\r\\n]*<|tool▁call▁end|>"); - static const std::regex tool_calls_regex("[\\s\\r\\n]*(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>)([\\s\\S\\r\\n]*?)<|tool▁calls▁end|>"); - common_chat_msg msg; - msg.role = "assistant"; - std::smatch match; - if (std::regex_search(input, match, tool_calls_regex)) { - auto tool_calls = match[1].str(); - auto msg2 = parse_json_tool_calls(tool_calls, std::nullopt, function_regex, close_regex); - msg.tool_calls = std::move(msg2.tool_calls); - } else { - msg.content = input; - } - return msg; - }); + static const common_regex tool_calls_begin("(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)"); + static const common_regex tool_calls_end("<|tool▁calls▁end|>"); + static const common_regex function_regex("(?:<|tool▁call▁begin|>)?function<|tool▁sep|>([^\n]+)\n```json\n"); + static const common_regex close_regex("```[\\s\\r\\n]*<|tool▁call▁end|>"); + + parse_json_tool_calls( + builder, + /* block_open= */ tool_calls_begin, + /* function_regex_start_only= */ std::nullopt, + function_regex, + close_regex, + tool_calls_end); } static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) { @@ -1206,13 +1323,19 @@ static common_chat_params common_chat_params_init_firefunction_v2(const common_c } return data; } -static common_chat_msg common_chat_parse_firefunction_v2(const std::string & input) { - return parse_prefixed_json_tool_call_array(input, " functools[", /* rstrip_prefix= */ 1); +static void common_chat_parse_firefunction_v2(common_chat_msg_parser & builder) { + if (!builder.syntax().parse_tool_calls) { + builder.add_content(builder.consume_rest()); + return; + } + static const common_regex prefix(regex_escape(" functools[")); + 
parse_prefixed_json_tool_call_array(builder, prefix, /* rstrip_prefix= */ 1); } static common_chat_params common_chat_params_init_functionary_v3_2(const common_chat_template & tmpl, const struct templates_params & inputs) { // >>>all\nlet's call functions>>>fn1\n{"arg1": 1...}\n>>>fn2\n{"arg1": 1...}... // Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar + // If the function is python, we also allow raw python code (if the line after `python\n` doesn't start w/ opening `{`), which the model seems to prefer for multiline code. common_chat_params data; data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2; @@ -1226,24 +1349,21 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_ std::string name = function.at("name"); auto parameters = function.at("parameters"); builder.resolve_refs(parameters); + std::string args_pattern = "[\\s\\S]*"; auto args_rule = builder.add_schema(name + "-args", parameters); - first_tool_rules.push_back(builder.add_rule(name + "-call", "( \"assistant<|end_header_id|>\\n\" )? \"" + name + "\\n\" " + args_rule)); - subsequent_tool_rules.push_back(builder.add_rule(name + "-call2", "\">>>" + name + "\\n\" " + args_rule)); + if (name == "python") { + args_rule = builder.add_rule(name + "-maybe-raw-args", args_rule + " | [^{] .*"); + } else { + args_pattern = "\\{" + args_pattern; + } + auto call_rule = builder.add_rule(name + "-call", "\"" + name + "\\n\" " + args_rule); + first_tool_rules.push_back(call_rule); + if (inputs.parallel_tool_calls) { + subsequent_tool_rules.push_back(builder.add_rule(name + "-call2", "\">>>\" " + call_rule)); + } data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START, - regex_escape(name + "\n"), - }); - data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START, - regex_escape("assistant<|end_header_id|>\n" + name + "\n"), - }); - data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_WORD, - regex_escape(">>>" + name + "\n"), - }); - data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_WORD, - ">>>assistant<|end_header_id|>\n" + name, + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL, + "((?:[\\s\\S]+?>>>)?" + regex_escape(name) + "\n)" + args_pattern, }); }); data.preserved_tokens = { @@ -1261,40 +1381,33 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_ } return data; } +static void common_chat_parse_functionary_v3_2(common_chat_msg_parser & builder) { + static const common_regex function_regex_start_only(R"((\w+\n\{|python\n|all\n))"); + static const common_regex function_regex(R"(>>>(\w+\n\{|python\n|all\n))"); + static const common_regex close_regex(R"(\s*)"); -static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & input) { - static const std::regex function_regex(R"((?:>>>)?(?:assistant<|end_header_id|>\n)?(\w+)\n)"); - static const std::regex close_regex(R"($|(?=>>>))"); - - std::string content; - auto it = input.begin(); - const auto end = input.end(); - - if (parse_literal(it, end, "all\n")) { - std::smatch match; - if (std::regex_search(it, end, match, function_regex)) { - auto fun_it = match.prefix().second; - content = std::string(it, fun_it); - it = fun_it; - } else { - common_chat_msg res; - res.role = "assistant"; - res.content = std::string(it, end); - return res; - } - } - // TODO: tighten & simplify. 
- try { - auto res = parse_json_tool_calls(std::string(it, end), std::nullopt, function_regex, close_regex, /* allow_raw_python= */ true); - res.content = content + res.content; - return res; - } catch (const std::exception & e) { - LOG_ERR("Failed to parse functionary v3.2 input: %s\n", e.what()); - common_chat_msg res; - res.role = "assistant"; - res.content = input; - return res; - } + parse_json_tool_calls( + builder, + std::nullopt, + function_regex_start_only, + function_regex, + close_regex, + std::nullopt, + /* allow_raw_python= */ true, + /* get_function_name= */ [&](const auto & res) -> std::string { + auto at_start = res.groups[0].begin == 0; + auto name = builder.str(res.groups[1]); + if (!name.empty() && name.back() == '{') { + // Unconsume the opening brace '{' to ensure the JSON parsing goes well. + builder.move_back(1); + } + auto idx = name.find_last_not_of("\n{"); + name = name.substr(0, idx + 1); + if (at_start && name == "all") { + return ""; + } + return name; + }); } static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(const common_chat_template & tmpl, const struct templates_params & inputs) { @@ -1355,229 +1468,224 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con // TODO: if (has_raw_python) return data; } -static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::string & input) { - // This version of Functionary still supports the llama 3.1 tool call format for the python tool. - static const std::regex python_tag_regex(R"(<\|python_tag\|>([\s\S\n]*)$)"); - std::smatch match; - if (std::regex_search(input, match, python_tag_regex)) { - auto code = match[1].str(); - common_chat_msg msg; - msg.role = "assistant"; - msg.content = match.prefix().str(); - msg.tool_calls.push_back({ - /* .name = */ "python", - /* .arguments = */ (json {{"code", code}}).dump(), - /* .id = */ "", - }); - return msg; +static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser & builder) { + if (!builder.syntax().parse_tool_calls) { + builder.add_content(builder.consume_rest()); + return; + } + // This version of Functionary still supports the llama 3.1 tool call format for the python tool. + static const common_regex python_tag_regex(regex_escape("<|python_tag|>")); + + static const common_regex function_regex(R"()"); + static const common_regex close_regex(R"()"); + + parse_json_tool_calls( + builder, + /* block_open= */ std::nullopt, + /* function_regex_start_only= */ std::nullopt, + function_regex, + close_regex, + std::nullopt); + + if (auto res = builder.try_find_regex(python_tag_regex)) { + auto arguments = wrap_code_as_arguments(builder, builder.consume_rest()); + builder.add_tool_call("python", "", arguments); + return; } - static const std::regex function_regex(R"()"); - static const std::regex close_regex(R"()"); - // TODO: tighten & simplify. 
- return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex); } static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) { common_chat_params data; - // (content)?({"name": "foo", "arguments": {"a": 1}})* - data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; - data.grammar = build_grammar([&](const common_grammar_builder & builder) { - std::vector tool_rules; - std::vector tool_call_alts; - foreach_function(inputs.tools, [&](const json & tool) { - const auto & function = tool.at("function"); - std::string name = function.at("name"); - auto parameters = function.at("parameters"); - builder.resolve_refs(parameters); - tool_rules.push_back(builder.add_schema(name + "-call", { - {"type", "object"}, - {"properties", json { - {"name", json {{"const", name}}}, - {"arguments", parameters}, - }}, - {"required", json::array({"name", "arguments"})}, - })); - tool_call_alts.push_back(builder.add_rule( - name + "-function-tag", - "\"\" space " + - builder.add_schema(name + "-args", parameters) + " " - "\"\" space")); - data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_WORD, - "", - }); - auto escaped_name = regex_escape(name); - data.grammar_triggers.push_back({ - COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, - " alt_tags { - any_tool_call, - "\"\" space " + any_tool_call + " \"\"", - // The rest is just to accommodate common "good bad" outputs. - "\"\" space " + any_tool_call + " \"\"", - "\"\" space " + any_tool_call + " \"\"", - "\"\" space " + any_tool_call + " \"\"", - "\"\" space " + any_tool_call + " \"\"", - "\"\" space " + any_tool_call + " \"\"", - "\"\" space " + any_tool_call + " \"\"", - }; - auto wrappable_tool_call = builder.add_rule("wrappable_tool_call", "( " + string_join(alt_tags, " | ") + " ) space"); - tool_call_alts.push_back(wrappable_tool_call); - tool_call_alts.push_back( - "( \"```\\n\" | \"```json\\n\" | \"```xml\\n\" ) space " + wrappable_tool_call + " space \"```\" space "); - auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | ")); - builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call); - data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, ""}); - data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "|||)?\\s*\\{\\s*\"", //name\"\\s*:\\s*\"" + escaped_name + "\"", - }); - data.preserved_tokens = { - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", - "```", - "```json", - "```xml", - }; - }); + json additional_context = { + {"enable_thinking", inputs.enable_thinking}, + }; + + data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? 
json() : inputs.tools, inputs.add_generation_prompt, additional_context); + data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO; + if (string_ends_with(data.prompt, "\n")) { + if (!inputs.enable_thinking) { + data.prompt += ""; + } else { + data.thinking_forced_open = true; + } + } + + if (!inputs.tools.is_null()) { + // (content)?({"name": "foo", "arguments": {"a": 1}})* + data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; + data.grammar = build_grammar([&](const common_grammar_builder & builder) { + std::vector tool_rules; + std::vector tool_call_alts; + std::vector escaped_names; + foreach_function(inputs.tools, [&](const json & tool) { + const auto & function = tool.at("function"); + std::string name = function.at("name"); + auto parameters = function.at("parameters"); + builder.resolve_refs(parameters); + tool_rules.push_back(builder.add_schema(name + "-call", { + {"type", "object"}, + {"properties", json { + {"name", json {{"const", name}}}, + {"arguments", parameters}, + }}, + {"required", json::array({"name", "arguments"})}, + })); + tool_call_alts.push_back(builder.add_rule( + name + "-function-tag", + "\"\" space " + + builder.add_schema(name + "-args", parameters) + " " + "\"\" space")); + + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_WORD, + "", + }); + auto escaped_name = regex_escape(name); + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, + " alt_tags { + any_tool_call, + "\"\" space " + any_tool_call + " \"\"", + // The rest is just to accommodate common "good bad" outputs. + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + }; + auto wrappable_tool_call = builder.add_rule("wrappable_tool_call", "( " + string_join(alt_tags, " | ") + " ) space"); + tool_call_alts.push_back(wrappable_tool_call); + tool_call_alts.push_back( + "( \"```\\n\" | \"```json\\n\" | \"```xml\\n\" ) space " + wrappable_tool_call + " space \"```\" space "); + auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | ")); + builder.add_rule("root", + std::string(data.thinking_forced_open ? "( \"\" space )? " : "") + + (inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call)); + // Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives) + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL, + // If thinking_forced_open, then we capture the tag in the grammar, + // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar) + std::string(data.thinking_forced_open ? "[\\s\\S]*?(\\s*)" : "(?:[\\s\\S]*?\\s*)?") + ( + "(\\s*" + "(?:" + "||||)?" + "\\s*\\{\\s*\"name\"\\s*:\\s*\"(?:" + string_join(escaped_names, "|") + ")\"" + ")" + ")[\\s\\S]*" + ), + }); + data.preserved_tokens = { + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "```", + "```json", + "```xml", + }; + }); + } - data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); - data.format = inputs.extract_reasoning ? 
COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING : COMMON_CHAT_FORMAT_HERMES_2_PRO; return data; } -static common_chat_msg common_chat_parse_hermes_2_pro(const std::string& input, bool extract_reasoning) { - return handle_think_tag_prelude(input, extract_reasoning, [](const std::string & input) { - static const std::regex open_regex( - "(?:" - "(```(?:xml|json)?\\n\\s*)?" // match 1 (block_start) - "(" // match 2 (open_tag) - "|" - "|" - "|" - "|" - "|" - "|" - "|" +static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) { + builder.try_parse_reasoning("", ""); + if (!builder.syntax().parse_tool_calls) { + builder.add_content(builder.consume_rest()); + return; + } + + static const common_regex open_regex( + "(?:" + "(```(?:xml|json)?\\n\\s*)?" // match 1 (block_start) + "(" // match 2 (open_tag) + "" + "|" + "|" + "|" + "|" + "|" + "|" + "|" ")?" - "(\\s*\\{\\s*\"name\"\\s*:[\\s\\S]*)" // match 3 (named tool call + rest) - ")" - "|" - "(?:]+)>" // match 4 (function name) - "|)" // match 5 (function name again) - "([\\s\\S]*)" // match 6 (function arguments + rest)})" - ); + "(\\s*\\{\\s*\"name\")" // match 3 (named tool call) + ")" + "|]+)>" // match 4 (function name) + "|" // match 5 (function name again) + ); - try { - common_chat_msg msg; - msg.role = "assistant"; + if (auto res = builder.try_find_regex(open_regex)) { + const auto & block_start = res->groups[1]; + std::string block_end = block_start.empty() ? "" : "```"; - std::string::const_iterator it = input.begin(); - const std::string::const_iterator end = input.end(); - std::smatch match; + const auto & open_tag = res->groups[2]; + std::string close_tag; - while (it != end) { - if (std::regex_search(it, end, match, open_regex)) { - // Add content before the match - msg.content += std::string(it, match[0].first); + if (!res->groups[3].empty()) { + builder.move_to(res->groups[3].begin); + close_tag = open_tag.empty() ? "" : "value) || tool_call->is_partial) { + throw common_chat_msg_partial_exception("incomplete tool call"); + } + builder.consume_spaces(); + builder.consume_literal(close_tag); + builder.consume_spaces(); + if (!block_end.empty()) { + builder.consume_literal(block_end); + builder.consume_spaces(); + } + builder.add_content(builder.consume_rest()); + } else { + throw common_chat_msg_partial_exception("failed to parse tool call"); + } + } else { + auto function_name = builder.str(res->groups[4]); + if (function_name.empty()) { + function_name = builder.str(res->groups[5]); + } + GGML_ASSERT(!function_name.empty()); - auto open_tag = match[2].str(); - std::string close_tag; + close_tag = ""; - if (match[3].matched) { - close_tag = open_tag.empty() ? 
"" : ""; - // Start parsing from after the opening tags - auto json_it = match[6].first; - json arguments; - if (parse_json(json_it, end, arguments)) { - msg.tool_calls.emplace_back(process_tool_call({ - {"name", function_name}, - {"arguments", arguments}, - })); - it = json_it; // Move iterator past parsed JSON - - // Handle close tags - consume_spaces(it, end); - if (!close_tag.empty() && !parse_literal(it, end, close_tag)) { - throw std::runtime_error("Failed to parse closing tag"); - } - consume_spaces(it, end); - if (!block_end.empty() && !parse_literal(it, end, block_end)) { - throw std::runtime_error("Failed to parse block end"); - } - consume_spaces(it, end); - } else { - // Not a valid tool call, treat as content - msg.content += std::string(match[0].first, match[0].second); - it = match[0].second; - } - } - } else { - // Add remaining content - msg.content += std::string(it, end); - break; + if (auto arguments = builder.try_consume_json_with_dumped_args({{}})) { + if (!builder.add_tool_call(function_name, "", arguments->value) || arguments->is_partial) { + throw common_chat_msg_partial_exception("incomplete tool call"); + } + builder.consume_spaces(); + builder.consume_literal(close_tag); + builder.consume_spaces(); + if (!block_end.empty()) { + builder.consume_literal(block_end); + builder.consume_spaces(); } } - return msg; - } catch (const std::exception & e) { - LOG_ERR("Failed to parse hermes 2 pro input: %s\n", e.what()); - common_chat_msg msg; - msg.role = "assistant"; - msg.content = input; - return msg; + builder.add_content(builder.consume_rest()); } - }); + } else { + builder.add_content(builder.consume_rest()); + } } static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) { @@ -1609,8 +1717,8 @@ static common_chat_params common_chat_templates_apply_jinja( const auto & caps = tmpl.original_caps(); params.messages = common_chat_msgs_to_json_oaicompat(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content); params.add_generation_prompt = inputs.add_generation_prompt; - params.extract_reasoning = inputs.extract_reasoning; params.tool_choice = inputs.tool_choice; + params.enable_thinking = inputs.enable_thinking; params.grammar = inputs.grammar; params.now = inputs.now; if (!inputs.json_schema.empty()) { @@ -1644,7 +1752,7 @@ static common_chat_params common_chat_templates_apply_jinja( } // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools) - if (src.find("") != std::string::npos && params.json_schema.is_null() && params.tools.is_array() && params.json_schema.is_null()) { + if (src.find("") != std::string::npos && params.json_schema.is_null()) { return common_chat_params_init_hermes_2_pro(tmpl, params); } @@ -1758,44 +1866,64 @@ common_chat_params common_chat_templates_apply( : common_chat_templates_apply_legacy(tmpls, inputs); } -static common_chat_msg common_chat_parse_content_only(const std::string & input) { - common_chat_msg msg; - msg.role = "assistant"; - msg.content = input; - return msg; +static void common_chat_parse_content_only(common_chat_msg_parser & builder) { + builder.add_content(builder.consume_rest()); } -common_chat_msg common_chat_parse(const std::string & input, common_chat_format format) { - switch (format) { +static void common_chat_parse(common_chat_msg_parser & builder) { + LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(builder.syntax().format), builder.input().c_str()); + + switch (builder.syntax().format) { case 
COMMON_CHAT_FORMAT_CONTENT_ONLY: - return common_chat_parse_content_only(input); + common_chat_parse_content_only(builder); + break; case COMMON_CHAT_FORMAT_GENERIC: - return common_chat_parse_generic(input); + common_chat_parse_generic(builder); + break; case COMMON_CHAT_FORMAT_MISTRAL_NEMO: - return common_chat_parse_mistral_nemo(input); + common_chat_parse_mistral_nemo(builder); + break; case COMMON_CHAT_FORMAT_LLAMA_3_X: - return common_chat_parse_llama_3_1(input); + common_chat_parse_llama_3_1(builder); + break; case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: - return common_chat_parse_llama_3_1(input, /* with_builtin_tools= */ true); + common_chat_parse_llama_3_1(builder, /* with_builtin_tools= */ true); + break; case COMMON_CHAT_FORMAT_DEEPSEEK_R1: - return common_chat_parse_deepseek_r1(input, /* extract_reasoning= */ false); - case COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING: - return common_chat_parse_deepseek_r1(input, /* extract_reasoning= */ true); + common_chat_parse_deepseek_r1(builder); + break; case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: - return common_chat_parse_functionary_v3_2(input); + common_chat_parse_functionary_v3_2(builder); + break; case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: - return common_chat_parse_functionary_v3_1_llama_3_1(input); + common_chat_parse_functionary_v3_1_llama_3_1(builder); + break; case COMMON_CHAT_FORMAT_HERMES_2_PRO: - return common_chat_parse_hermes_2_pro(input, /* extract_reasoning= */ false); - case COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING: - return common_chat_parse_hermes_2_pro(input, /* extract_reasoning= */ true); + common_chat_parse_hermes_2_pro(builder); + break; case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: - return common_chat_parse_firefunction_v2(input); + common_chat_parse_firefunction_v2(builder); + break; case COMMON_CHAT_FORMAT_COMMAND_R7B: - return common_chat_parse_command_r7b(input, /* extract_reasoning= */ false); - case COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING: - return common_chat_parse_command_r7b(input, /* extract_reasoning= */ true); + common_chat_parse_command_r7b(builder); + break; default: - throw std::runtime_error("Unsupported format: " + common_chat_format_name(format)); + throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format)); } + builder.finish(); +} + +common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax) { + common_chat_msg_parser builder(input, is_partial, syntax); + try { + common_chat_parse(builder); + } catch (const common_chat_msg_partial_exception & ex) { + LOG_DBG("Partial parse: %s\n", ex.what()); + if (!is_partial) { + throw std::runtime_error(ex.what()); + } + } + auto msg = builder.result(); + LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat({msg}).at(0).dump().c_str()); + return msg; } diff --git a/common/chat.h b/common/chat.h index d26a09c2f..f6b1d0ffc 100644 --- a/common/chat.h +++ b/common/chat.h @@ -3,6 +3,7 @@ #pragma once #include "common.h" +#include #include #include #include @@ -13,11 +14,19 @@ struct common_chat_tool_call { std::string name; std::string arguments; std::string id; + + bool operator==(const common_chat_tool_call & other) const { + return name == other.name && arguments == other.arguments && id == other.id; + } }; struct common_chat_msg_content_part { std::string type; std::string text; + + bool operator==(const common_chat_msg_content_part & other) const { + return type == other.type && text == other.text; + 
} }; struct common_chat_msg { @@ -28,6 +37,51 @@ struct common_chat_msg { std::string reasoning_content; std::string tool_name; std::string tool_call_id; + + template T to_json_oaicompat() const; + + bool empty() const { + return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty(); + } + void ensure_tool_call_ids_set(std::vector & ids_cache, const std::function & gen_tool_call_id) { + for (auto i = 0u; i < tool_calls.size(); i++) { + if (ids_cache.size() <= i) { + auto id = tool_calls[i].id; + if (id.empty()) { + id = gen_tool_call_id(); + } + ids_cache.push_back(id); + } + tool_calls[i].id = ids_cache[i]; + } + } + bool operator==(const common_chat_msg & other) const { + return role == other.role + && content == other.content + && content_parts == other.content_parts + && tool_calls == other.tool_calls + && reasoning_content == other.reasoning_content + && tool_name == other.tool_name + && tool_call_id == other.tool_call_id; + } + bool operator!=(const common_chat_msg & other) const { + return !(*this == other); + } +}; + +struct common_chat_msg_diff { + // std::string reasoning_content_delta; + std::string content_delta; + size_t tool_call_index = std::string::npos; + common_chat_tool_call tool_call_delta; + + static std::vector compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg); + + bool operator==(const common_chat_msg_diff & other) const { + return content_delta == other.content_delta + && tool_call_index == other.tool_call_index + && tool_call_delta == other.tool_call_delta; + } }; struct common_chat_tool { @@ -49,14 +103,11 @@ enum common_chat_format { COMMON_CHAT_FORMAT_LLAMA_3_X, COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS, COMMON_CHAT_FORMAT_DEEPSEEK_R1, - COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING, COMMON_CHAT_FORMAT_FIREFUNCTION_V2, COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2, COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1, COMMON_CHAT_FORMAT_HERMES_2_PRO, - COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING, COMMON_CHAT_FORMAT_COMMAND_R7B, - COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING, COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats }; @@ -71,7 +122,8 @@ struct common_chat_templates_inputs { std::vector tools; common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO; bool parallel_tool_calls = false; - bool extract_reasoning = true; + common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; + bool enable_thinking = true; std::chrono::system_clock::time_point now = std::chrono::system_clock::now(); }; @@ -80,11 +132,21 @@ struct common_chat_params { std::string prompt; std::string grammar; bool grammar_lazy = false; + bool thinking_forced_open = false; std::vector grammar_triggers; std::vector preserved_tokens; std::vector additional_stops; }; +struct common_chat_syntax { + common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY; + common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; + // Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode) + bool reasoning_in_content = false; + bool thinking_forced_open = false; + bool parse_tool_calls = true; +}; + // Check if the template supplied via "--chat-template" is supported or not. 
Returns true if it's valid bool common_chat_verify_template(const std::string & tmpl, bool use_jinja); @@ -121,8 +183,9 @@ std::string common_chat_format_example( const struct common_chat_templates * tmpls, bool use_jinja); -std::string common_chat_format_name(common_chat_format format); -common_chat_msg common_chat_parse( const std::string & input, common_chat_format format); +const char* common_chat_format_name(common_chat_format format); +const char* common_reasoning_format_name(common_reasoning_format format); +common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax); common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice); @@ -135,3 +198,5 @@ template T common_chat_msgs_to_json_oaicompat(const std::vector std::vector common_chat_tools_parse_oaicompat(const T & tools); template T common_chat_tools_to_json_oaicompat(const std::vector & tools); + +template T common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff); diff --git a/common/common.cpp b/common/common.cpp index eb16055ea..4cc40ed8b 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -203,6 +203,7 @@ bool set_process_priority(enum ggml_sched_priority prio) { DWORD p = NORMAL_PRIORITY_CLASS; switch (prio) { + case GGML_SCHED_PRIO_LOW: p = BELOW_NORMAL_PRIORITY_CLASS; break; case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break; case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break; case GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break; @@ -228,6 +229,7 @@ bool set_process_priority(enum ggml_sched_priority prio) { int p = 0; switch (prio) { + case GGML_SCHED_PRIO_LOW: p = 5; break; case GGML_SCHED_PRIO_NORMAL: p = 0; break; case GGML_SCHED_PRIO_MEDIUM: p = -5; break; case GGML_SCHED_PRIO_HIGH: p = -10; break; @@ -849,7 +851,7 @@ std::string fs_get_cache_directory() { if (getenv("LLAMA_CACHE")) { cache_directory = std::getenv("LLAMA_CACHE"); } else { -#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) +#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__) if (std::getenv("XDG_CACHE_HOME")) { cache_directory = std::getenv("XDG_CACHE_HOME"); } else { @@ -903,13 +905,16 @@ struct common_init_result common_init_from_params(common_params & params) { ok = false; } - if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) { - LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__); - ok = false; - } + bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL; + bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL; - if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) { - LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__); + if (!has_eos && !has_sep) { + LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__); + ok = false; + } else if (!has_eos) { + LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__); + } else if (!has_sep) { + LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__); ok = false; } diff --git a/common/common.h b/common/common.h index 556ff5be4..cee1e3039 100644 --- a/common/common.h +++ b/common/common.h @@ -76,7 +76,7 @@ enum llama_example { LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, - LLAMA_EXAMPLE_LLAVA, + LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_PARALLEL, LLAMA_EXAMPLE_TTS, 
@@ -115,7 +115,7 @@ enum common_grammar_trigger_type { COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN, COMMON_GRAMMAR_TRIGGER_TYPE_WORD, COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, - COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START, + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL, }; struct common_grammar_trigger { @@ -291,6 +291,7 @@ struct common_params { int32_t verbosity = 0; int32_t control_vector_layer_start = -1; // layer range for control vector int32_t control_vector_layer_end = -1; // layer range for control vector + bool offline = false; int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used. int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line @@ -368,6 +369,7 @@ struct common_params { bool use_jinja = false; // NOLINT bool enable_chat_template = true; common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + int reasoning_budget = -1; bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response std::vector api_keys; diff --git a/common/json-partial.cpp b/common/json-partial.cpp new file mode 100644 index 000000000..d9d916998 --- /dev/null +++ b/common/json-partial.cpp @@ -0,0 +1,256 @@ +#include "json-partial.h" + +#include "log.h" + +#include + +#include + +using json = nlohmann::ordered_json; + +enum common_json_stack_element_type { + COMMON_JSON_STACK_ELEMENT_OBJECT, + COMMON_JSON_STACK_ELEMENT_KEY, + COMMON_JSON_STACK_ELEMENT_ARRAY, +}; + +struct common_json_stack_element { + common_json_stack_element_type type; + std::string key; +}; + +bool common_json_parse( + const std::string & input, + const std::string & healing_marker, + common_json & out) +{ + std::string::const_iterator it = input.begin(); + const auto end = input.end(); + return common_json_parse(it, end, healing_marker, out); +} + +bool common_json_parse( + std::string::const_iterator & it, + const std::string::const_iterator & end, + const std::string & healing_marker, + common_json & out) +{ + // // https://json.nlohmann.me/features/parsing/sax_interface/ + struct json_error_locator : public nlohmann::json_sax { + std::size_t position; + bool found_error; + std::string last_token; + std::string exception_message; + std::vector stack; + + json_error_locator() : position(0), found_error(false) {} + + bool parse_error(std::size_t position, const std::string & last_token, const json::exception & ex) override { // NOLINT + this->position = position - 1; + this->found_error = true; + this->last_token = last_token; + this->exception_message = ex.what(); + return false; + } + void close_value() { + if (!stack.empty() && (stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY)) { + stack.pop_back(); + } + } + bool null() override { // NOLINT + close_value(); + return true; + } + bool boolean(bool) override { // NOLINT + close_value(); + return true; + } + bool number_integer(number_integer_t) override { // NOLINT + close_value(); + return true; + } + bool number_unsigned(number_unsigned_t) override { // NOLINT + close_value(); + return true; + } + bool number_float(number_float_t, const string_t &) override { // NOLINT + close_value(); + return true; + } + bool string(string_t &) override { // NOLINT + close_value(); + return true; + } + bool binary(binary_t &) override { // NOLINT + close_value(); + return true; + } + bool start_object(std::size_t) override { // NOLINT + stack.push_back({COMMON_JSON_STACK_ELEMENT_OBJECT, ""}); + return true; + } + bool 
end_object() override { + GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT); + stack.pop_back(); + close_value(); + return true; + } + bool key(string_t & key) override { // NOLINT + stack.push_back({COMMON_JSON_STACK_ELEMENT_KEY, key}); + return true; + } + bool start_array(std::size_t) override { // NOLINT + stack.push_back({COMMON_JSON_STACK_ELEMENT_ARRAY, ""}); + return true; + } + bool end_array() override { + GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY); + stack.pop_back(); + close_value(); + return true; + } + }; + json_error_locator err_loc; + auto start = it; + json::sax_parse(it, end, &err_loc); + + if (err_loc.found_error) { + it = start; + auto temptative_end = it + err_loc.position; + // LOG_DBG("Error at position %zu (is_end = %s): %s\n", err_loc.position, temptative_end == end ? "true" : "false", err_loc.exception_message.c_str()); + + auto input = std::string(it, temptative_end); + try { + out.json = json::parse(input); + // out.json = json::parse(it, temptative_end); + it = temptative_end; + return true; + } catch (const std::exception & ex) { + // No, needs healing. + LOG_DBG("Failed to parse up to error: %s: <<<%s>>>\n", ex.what(), std::string(it, temptative_end).c_str()); + } + auto can_parse = [](const std::string & str) { + try { + auto _ = json::parse(str); // NOLINT + return true; + } catch (const std::exception &) { + return false; + } + }; + if (!healing_marker.empty() && !err_loc.stack.empty()) { + std::string str(it, temptative_end); + auto last_non_sp_pos = str.find_last_not_of(" \n\r\t"); + if (last_non_sp_pos == std::string::npos) { + throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location"); + } + auto last_non_sp_char = str[last_non_sp_pos]; + // Used to detect stops on a number, which may not be complete. + auto was_maybe_number = [&]() { + if (!str.empty() && std::isspace(str.back())) { + return false; + } + return std::isdigit(last_non_sp_char) || + last_non_sp_char == '.' 
|| + last_non_sp_char == 'e' || + last_non_sp_char == 'E' || + last_non_sp_char == '-'; + }; + + std::string closing; + for (size_t i = err_loc.stack.size(); i > 0; i--) { + auto & el = err_loc.stack[i - 1]; + if (el.type == COMMON_JSON_STACK_ELEMENT_OBJECT) { + closing += "}"; + } else if (el.type == COMMON_JSON_STACK_ELEMENT_ARRAY) { + closing += "]"; + } else if (el.type != COMMON_JSON_STACK_ELEMENT_KEY) { + throw std::runtime_error("Unexpected stack element type"); + } + } + + const auto & magic_seed = out.healing_marker.marker = healing_marker;//"$llama.cpp.json$"; + + if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY) { + // We're inside an object value + if (last_non_sp_char == ':' && can_parse(str + "1" + closing)) { + // Was about to create an object value + str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing; + } else if (can_parse(str + ": 1" + closing)) { + str += (out.healing_marker.json_dump_marker = ":\"" + magic_seed) + "\"" + closing; + } else if (last_non_sp_char == '{' && can_parse(str + closing)) { + // Was about to create an object + str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing; + } else if (can_parse(str + "\"" + closing)) { + // Was inside an object value string + str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing; + } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) { + // Was inside an object value string after an escape + str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing; + } else { + // find last : + auto last_pos = str.find_last_of(':'); + if (last_pos == std::string::npos) { + throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location"); + } + // Cutting back to opening : for object value + str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing; + } + } else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY) { + if ((last_non_sp_char == ',' || last_non_sp_char == '[') && can_parse(str + "1" + closing)) { + // Was about to create an array value + str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing; + } else if (can_parse(str + "\"" + closing)) { + // Was inside an array value string + str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing; + } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) { + // Was inside an array value string after an escape + str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing; + } else if (!was_maybe_number() && can_parse(str + ", 1" + closing)) { + // Had just finished a value + str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\"" + closing; + } else { + auto last_pos = str.find_last_of("[,"); + if (last_pos == std::string::npos) { + throw std::runtime_error("Cannot heal a truncated JSON array stopped in an unknown location"); + } + // Cutting back to last [ or , for array value + str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing; + } + } else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT) { + if ((last_non_sp_char == '{' && can_parse(str + closing)) || + (last_non_sp_char == ',' && can_parse(str + "\"\": 1" + closing))) { + // Was about to create an object key+value + str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing; + } else if 
(!was_maybe_number() && can_parse(str + ",\"\": 1" + closing)) { + // Was about to create an object key+value + str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\": 1" + closing; + } else if (can_parse(str + "\": 1" + closing)) { + // Was inside an object key string + str += (out.healing_marker.json_dump_marker = magic_seed) + "\": 1" + closing; + } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\": 1" + closing)) { + // Was inside an object key string after an escape + str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\": 1" + closing; + } else { + auto last_pos = str.find_last_of(':'); + if (last_pos == std::string::npos) { + throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location"); + } + // fprintf(stderr, "Cutting back to last : for object key+value\n"); + str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing; + } + } else { + throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location"); + } + // fprintf(stderr, "HEALED:\nSTRING <<<\n%s\n>>>\n\nmagic_cut: <<<\n%s\n>>>\n\n", str.c_str(), out.healing_marker.json_dump_marker.c_str()); + out.json = json::parse(str); + it = temptative_end; + return true; + } + // TODO: handle unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...) + // fprintf(stderr, "Closing: TODO\n"); + return false; + } + out.json = json::parse(it, end); + it = end; + return true; +} diff --git a/common/json-partial.h b/common/json-partial.h new file mode 100644 index 000000000..f63356dc4 --- /dev/null +++ b/common/json-partial.h @@ -0,0 +1,38 @@ +#pragma once + +#include + +// Healing marker (empty if the JSON was fully parsed / wasn't healed). +struct common_healing_marker { + // Raw marker. + std::string marker; + + // Cutting the `common_json.json.dump()` string at the (only) occurrence of this marker should yield the original partial JSON string (modulo spaces / if it had the same dump format). + std::string json_dump_marker; +}; + +// Represents a parsed JSON object, with its optional healing marker (a JSON dump fragment that can be used to find the position of healing in the JSON dump string) +struct common_json { + nlohmann::ordered_json json; + + common_healing_marker healing_marker; +}; + +// Parse the JSON string, healing (closing) any partial JSON if `healing_marker` is not empty. +// +// Healing completes partial JSON strings by adding a (possibly modified) healing marker, then whatever is needed to close the JSON. +// This allows to parse the resulting healed JSON string, yet be able to cut it again if needed at the healing marker. +// (this is used when parsing JSON outputs from the models, then crafting partial JSONs for the partial tool calls in OAI format). +// +// For instance, parsing `{` with a healing marker `foo` will produce a healed JSON `{"foo":1}`, w/ json_dump_marker = `"foo"` (which can be used to break the JSON again). +bool common_json_parse( + const std::string & input, + const std::string & healing_marker, + common_json & out); + +// Parse the JSON string (see overload above), but advancing an iterator to the end of the input when the (potentially partial) parsing succeeds. 
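A minimal sketch of the healing behaviour documented above; it is not part of the patch, and the marker string and truncated inputs are invented for illustration.

```cpp
#include "json-partial.h"

#include <string>

int main() {
    common_json out;

    // A tool-call argument object cut off in the middle of a string value.
    std::string partial = R"({"location": "Par)";
    if (common_json_parse(partial, /* healing_marker= */ "$MARKER$", out)) {
        // out.json is now complete JSON, roughly {"location":"Par$MARKER$"} (the exact cut
        // point depends on where the SAX parser reported the error).
        std::string dump = out.json.dump();

        // Cutting the dump at json_dump_marker recovers the original prefix (modulo spaces),
        // which is how partial tool calls are re-cut when crafting OAI-style streamed chunks.
        std::string recut = dump.substr(0, dump.find(out.healing_marker.json_dump_marker));
    }

    // Note: a trailing, possibly incomplete number is dropped rather than healed, e.g.
    //   {"x": [1, 2   ->   {"x":[1,"$MARKER$"]}   (the "2" might have been "20", "2.5", ...).
    return 0;
}
```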
+bool common_json_parse( + std::string::const_iterator & it, + const std::string::const_iterator & end, + const std::string & healing_marker, + common_json & out); diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 5b3059c2f..d38a74f95 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -1,8 +1,9 @@ #include "json-schema-to-grammar.h" #include "common.h" +#include + #include -#include #include #include #include diff --git a/common/json-schema-to-grammar.h b/common/json-schema-to-grammar.h index 4613f5d9f..362991b54 100644 --- a/common/json-schema-to-grammar.h +++ b/common/json-schema-to-grammar.h @@ -1,9 +1,9 @@ #pragma once -#include "ggml.h" -// Change JSON_ASSERT from assert() to GGML_ASSERT: -#define JSON_ASSERT GGML_ASSERT -#include "json.hpp" +#include + +#include +#include std::string json_schema_to_grammar(const nlohmann::ordered_json & schema, bool force_gbnf = false); diff --git a/common/sampling.cpp b/common/sampling.cpp index 28705e24c..9c04d35fd 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -161,7 +161,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled"); #endif // LLAMA_USE_LLGUIDANCE } else { - std::vector patterns_at_start; + std::vector trigger_patterns; std::vector patterns_anywhere; std::vector trigger_tokens; for (const auto & trigger : params.grammar_triggers) { @@ -173,10 +173,13 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co break; } case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN: - case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START: { - const auto & pattern = trigger.value; - (trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START ? 
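For the sampler change in common/sampling.cpp around this point, a hypothetical illustration (not from the patch) of the two trigger-pattern kinds after the rename: `COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN` values may match anywhere in the generated text, so they get wrapped into a single full-match expression, while `COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL` values are passed through as full-match patterns. The helper below is a standalone re-statement of that wrapping.

```cpp
#include <regex>
#include <string>
#include <vector>

// Sketch of the pattern wrapping: "anywhere" patterns are OR-ed together and embedded in a
// full-match regex, full patterns are used as-is.
static std::vector<std::string> build_trigger_patterns(const std::vector<std::string> & anywhere,
                                                       const std::vector<std::string> & full) {
    std::vector<std::string> out = full;
    if (!anywhere.empty()) {
        std::string joined;
        for (size_t i = 0; i < anywhere.size(); ++i) {
            joined += (i ? "|" : "") + anywhere[i];
        }
        out.push_back("^[\\s\\S]*?(" + joined + ")[\\s\\S]*");
    }
    return out;
}

int main() {
    const auto patterns = build_trigger_patterns(/* anywhere */ {"<tool_call>"}, /* full */ {});
    const std::regex re(patterns.front());
    // Matches: the trigger token may appear anywhere in the generated text.
    const bool triggered = std::regex_match(std::string("some text <tool_call> {"), re);
    return triggered ? 0 : 1;
}
```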
patterns_at_start : patterns_anywhere).push_back(pattern); + patterns_anywhere.push_back(trigger.value); + break; + } + case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL: + { + trigger_patterns.push_back(trigger.value); break; } case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN: @@ -190,10 +193,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co } } - std::vector trigger_patterns; - if (!patterns_at_start.empty()) { - trigger_patterns.push_back("^(" + string_join(patterns_at_start, "|") + ")[\\s\\S]*"); - } if (!patterns_anywhere.empty()) { trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*"); } diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 753c88e7c..4c566af5f 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -45,7 +45,7 @@ class SentencePieceTokenTypes(IntEnum): class ModelType(IntEnum): TEXT = 1 - VISION = 2 + MMPROJ = 2 AnyModel = TypeVar("AnyModel", bound="type[ModelBase]") @@ -54,7 +54,7 @@ AnyModel = TypeVar("AnyModel", bound="type[ModelBase]") class ModelBase: _model_classes: dict[ModelType, dict[str, type[ModelBase]]] = { ModelType.TEXT: {}, - ModelType.VISION: {}, + ModelType.MMPROJ: {}, } dir_model: Path @@ -88,7 +88,7 @@ class ModelBase: small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None): if type(self) is ModelBase or \ type(self) is TextModel or \ - type(self) is VisionModel: + type(self) is MmprojModel: raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") self.dir_model = dir_model @@ -309,6 +309,7 @@ class ModelBase: gguf.MODEL_TENSOR.POSNET_NORM1, gguf.MODEL_TENSOR.POSNET_NORM2, gguf.MODEL_TENSOR.V_ENC_EMBD_POS, + gguf.MODEL_TENSOR.A_ENC_EMBD_POS, ) ) or not new_name.endswith(".weight") @@ -422,23 +423,26 @@ class ModelBase: try: # for security reason, we don't allow loading remote code by default # if a model need remote code, we will fallback to config.json - return AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict() + config = AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict() except Exception as e: logger.warning(f"Failed to load model config from {dir_model}: {e}") logger.warning("Trying to load config.json instead") with open(dir_model / "config.json", "r", encoding="utf-8") as f: config = json.load(f) - if "llm_config" in config: - # rename for InternVL - config["text_config"] = config["llm_config"] - return config + if "llm_config" in config: + # rename for InternVL + config["text_config"] = config["llm_config"] + if "thinker_config" in config: + # rename for Qwen2.5-Omni + config["text_config"] = config["thinker_config"]["text_config"] + return config @classmethod def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: assert names def func(modelcls: AnyModel) -> AnyModel: - model_type = ModelType.VISION if modelcls.model_arch == gguf.MODEL_ARCH.CLIP_VISION else ModelType.TEXT + model_type = ModelType.MMPROJ if modelcls.model_arch == gguf.MODEL_ARCH.MMPROJ else ModelType.TEXT for name in names: cls._model_classes[model_type][name] = modelcls return modelcls @@ -519,15 +523,15 @@ class TextModel(ModelBase): self.gguf_writer.add_context_length(n_ctx) logger.info(f"gguf: context length = {n_ctx}") - if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None: + if (n_embd := self.find_hparam(["hidden_size", "n_embd", "dim"], optional=True)) is not None: self.gguf_writer.add_embedding_length(n_embd) 
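A small illustration (assumed config shape, not from the patch) of why the fallback key lists above were extended: configs that use Mistral-style names are now picked up by `find_hparam`, which returns the first key present.

```python
# Hypothetical config.json contents using alternate key names:
hparams = {"dim": 4096, "hidden_dim": 14336, "n_heads": 32}
# find_hparam(["hidden_size", "n_embd", "dim"])               -> 4096
# find_hparam(["intermediate_size", "n_inner", "hidden_dim"]) -> 14336
# find_hparam(["num_attention_heads", "n_head", "n_heads"])   -> 32
```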
logger.info(f"gguf: embedding length = {n_embd}") - if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: + if (n_ff := self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None: self.gguf_writer.add_feed_forward_length(n_ff) logger.info(f"gguf: feed forward length = {n_ff}") - if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None: + if (n_head := self.find_hparam(["num_attention_heads", "n_head", "n_heads"], optional=True)) is not None: self.gguf_writer.add_head_count(n_head) logger.info(f"gguf: head count = {n_head}") @@ -670,12 +674,12 @@ class TextModel(ModelBase): if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed": # ref: https://huggingface.co/tiiuae/falcon-7b res = "falcon" - if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e": - # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base - res = "falcon3" if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": # ref: https://huggingface.co/BAAI/bge-small-en-v1.5 res = "bert-bge" + if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e": + # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base + res = "falcon3" if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7": # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5 res = "bert-bge-large" @@ -727,9 +731,6 @@ class TextModel(ModelBase): if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a": # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code res = "jina-v2-code" - if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" or chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516": - # ref: https://huggingface.co/THUDM/glm-4-9b-chat - res = "chatglm-bpe" if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee": # ref: https://huggingface.co/LumiOpen/Viking-7B res = "viking" @@ -760,9 +761,6 @@ class TextModel(ModelBase): if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450": # ref: https://huggingface.co/facebook/chameleon-7b res = "chameleon" - if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35": - # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0 - res = "minerva-7b" if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65": # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base res = "roberta-bpe" @@ -793,15 +791,24 @@ class TextModel(ModelBase): if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406": # ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct res = "llama4" - if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2": - # ref: https://huggingface.co/THUDM/glm-4-9b-hf - res = "glm4" if chkhsh == "0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3": # ref: https://huggingface.co/mistral-community/pixtral-12b res = "pixtral" if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec": # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base res = "seed-coder" + if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b": + # ref: https://huggingface.co/THUDM/glm-4-9b-chat + res = "chatglm-bpe" + if chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516": + # ref: 
https://huggingface.co/THUDM/glm-4-9b-chat + res = "chatglm-bpe" + if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2": + # ref: https://huggingface.co/THUDM/glm-4-9b-hf + res = "glm4" + if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35": + # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0 + res = "minerva-7b" if res is None: logger.warning("\n") @@ -1040,6 +1047,10 @@ class TextModel(ModelBase): special_vocab.chat_template = "rwkv-world" # hack: Add '\n\n' as the EOT token to make it chat normally special_vocab._set_special_token("eot", 261) + # hack: Override these as they have already been set (incorrectly) + special_vocab.special_token_ids["bos"] = 0 + special_vocab.special_token_ids["eos"] = 0 + special_vocab.add_to_gguf(self.gguf_writer) def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int): @@ -1114,60 +1125,116 @@ class TextModel(ModelBase): self.gguf_writer.add_pooling_type(pooling_type) -class VisionModel(ModelBase): - model_type = ModelType.VISION - model_arch = gguf.MODEL_ARCH.CLIP_VISION +class MmprojModel(ModelBase): + model_type = ModelType.MMPROJ + model_arch = gguf.MODEL_ARCH.MMPROJ preprocessor_config: dict[str, Any] global_config: dict[str, Any] + n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"] + + has_vision_encoder: bool = True # by default + has_audio_encoder: bool = False + + # for models having multiple encoders, we need to separate their hparams + hparams_vision: dict[str, Any] | None = None + hparams_audio: dict[str, Any] | None = None + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - if self.model_arch != gguf.MODEL_ARCH.CLIP_VISION: - raise TypeError("VisionModel must be subclassed with model_arch = gguf.MODEL_ARCH.CLIP_VISION") + if self.model_arch != gguf.MODEL_ARCH.MMPROJ: + raise TypeError("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ") # get n_embd of the text model if "text_config" not in self.hparams: self.hparams["text_config"] = {} + if "audio_config" not in self.hparams: + self.hparams["audio_config"] = {} text_config = {**self.hparams, **self.hparams["text_config"]} self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0)) assert self.n_embd_text > 0, "n_embd not found in hparams" - if "vision_config" not in self.hparams: - raise ValueError("vision_config not found in hparams") # move vision config to the top level, while preserving the original hparams in global_config - self.global_config = self.hparams - self.hparams = self.hparams["vision_config"] + import copy + self.global_config = copy.deepcopy(self.hparams) + self.hparams_vision = self.get_vision_config() + self.hparams_audio = self.get_audio_config() - self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]) - self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CLIP_VISION, self.block_count) + if self.hparams_vision is None and self.hparams_audio is None: + raise ValueError("vision_config / audio_config not found in hparams") + + # for compat with vision-only models + self.hparams = self.hparams_vision or self.hparams_audio or self.hparams + + # TODO @ngxson : this is a hack to support both vision and audio encoders + have_multiple_encoders = self.has_audio_encoder and self.has_vision_encoder + self.block_count = 128 if have_multiple_encoders else self.find_hparam(self.n_block_keys, True) + self.tensor_map = 
gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count) # load preprocessor config with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f: self.preprocessor_config = json.load(f) + def get_vision_config(self) -> dict[str, Any] | None: + return self.global_config.get("vision_config") + + def get_audio_config(self) -> dict[str, Any] | None: + return self.global_config.get("audio_config") + def set_type(self): - self.gguf_writer.add_type(gguf.GGUFType.CLIP_VISION) + self.gguf_writer.add_type(gguf.GGUFType.MMPROJ) def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_vision_projection_dim(self.n_embd_text) - self.gguf_writer.add_vision_has_vision_encoder(True) - # vision config - self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"])) - self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"])) - self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"])) - self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"])) - self.gguf_writer.add_vision_block_count(self.block_count) - self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"])) + if self.has_vision_encoder: + self.gguf_writer.add_clip_has_vision_encoder(True) + self.gguf_writer.add_vision_projection_dim(self.n_embd_text) - # preprocessor config - self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"]) - self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"]) + # vision config + self.gguf_writer.add_vision_image_size(self.find_vparam(["image_size"])) + self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"])) + self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"])) + self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"])) + self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys)) + self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"])) + + # preprocessor config + self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"]) + self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"]) + + if self.has_audio_encoder: + self.gguf_writer.add_clip_has_audio_encoder(True) + self.gguf_writer.add_audio_projection_dim(self.n_embd_text) + + # audio config + self.gguf_writer.add_audio_embedding_length(self.find_aparam(["hidden_size"])) + self.gguf_writer.add_audio_feed_forward_length(self.find_aparam(["intermediate_size"])) + self.gguf_writer.add_audio_block_count(self.find_aparam(self.n_block_keys)) + self.gguf_writer.add_audio_head_count(self.find_aparam(["num_attention_heads"])) + + if not self.has_vision_encoder and not self.has_audio_encoder: + raise ValueError("MmprojModel must have either vision or audio encoder") def write_vocab(self): - raise ValueError("VisionModel does not support vocab writing") + raise ValueError("MmprojModel does not support vocab writing") + + def find_vparam(self, keys: Iterable[str], optional: bool = False) -> Any: + assert self.hparams_vision is not None + return self._find_param(self.hparams_vision, keys, optional) + + def find_aparam(self, keys: Iterable[str], optional: bool = False) -> Any: + assert self.hparams_audio is not None + return self._find_param(self.hparams_audio, keys, optional) + + def _find_param(self, obj: dict[str, Any], keys: Iterable[str], optional: bool = False) -> Any: + key = next((k for k in keys if k in obj), 
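A hedged sketch (not from the patch) of how a multimodal subclass is expected to use the split hparams accessors above; the class name and config keys are invented for illustration.

```python
# Hypothetical subclass layout, assuming the MmprojModel helpers defined above.
class ExampleOmniModel(MmprojModel):
    has_vision_encoder = True
    has_audio_encoder = True

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        # find_vparam()/find_aparam() look up keys in vision_config / audio_config only,
        # so the two encoders can reuse the same key names without clashing.
        n_vision_embd = self.find_vparam(["hidden_size"])
        n_audio_embd = self.find_aparam(["hidden_size", "d_model"])
        # Optional keys return None instead of raising.
        window = self.find_vparam(["window_size"], optional=True)
```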
None) + if key is not None: + return obj[key] + if optional: + return None + raise KeyError(f"could not find any of: {keys}") @ModelBase.register("GPTNeoXForCausalLM") @@ -1781,7 +1848,8 @@ class StableLMModel(TextModel): "MistralForCausalLM", "MixtralForCausalLM", "VLlama3ForCausalLM", - "LlavaForConditionalGeneration") + "LlavaForConditionalGeneration", + "LlamaModel") class LlamaModel(TextModel): model_arch = gguf.MODEL_ARCH.LLAMA undo_permute = True @@ -1861,6 +1929,8 @@ class LlamaModel(TextModel): if is_vision_tensor: return [] # skip vision tensors + elif self.hf_arch == "LlamaModel": + name = "model." + name elif name.startswith("model.text_model"): name = name.replace("text_model.", "") # for SmolVLM elif name.startswith("language_model."): @@ -1951,7 +2021,7 @@ class LlamaModel(TextModel): "LlavaForConditionalGeneration", # pixtral "Mistral3ForConditionalGeneration", # mistral small 3.1 ) -class LlavaVisionModel(VisionModel): +class LlavaVisionModel(MmprojModel): img_break_tok_id = -1 def __init__(self, *args, **kwargs): @@ -1977,7 +2047,7 @@ class LlavaVisionModel(VisionModel): super().set_gguf_parameters() hparams = self.hparams if hparams["model_type"] == "pixtral": - self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.PIXTRAL) + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL) self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"]) # hidden_act @@ -2016,7 +2086,7 @@ class LlavaVisionModel(VisionModel): @ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration") -class SmolVLMModel(VisionModel): +class SmolVLMModel(MmprojModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) if self.hparams["model_type"] == "smolvlm_vision": @@ -2028,7 +2098,7 @@ class SmolVLMModel(VisionModel): def set_gguf_parameters(self): super().set_gguf_parameters() - self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.IDEFICS3) + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.IDEFICS3) self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5)) self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2)) self.gguf_writer.add_vision_use_gelu(True) @@ -2094,10 +2164,10 @@ class Llama4Model(LlamaModel): @ModelBase.register("Llama4ForConditionalGeneration") -class Llama4VisionModel(VisionModel): +class Llama4VisionModel(MmprojModel): def set_gguf_parameters(self): super().set_gguf_parameters() - self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.LLAMA4) + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LLAMA4) self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams["norm_eps"]) self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / self.hparams["pixel_shuffle_ratio"])) assert self.hparams["hidden_act"] == "gelu" @@ -2109,6 +2179,9 @@ class Llama4VisionModel(VisionModel): # process vision tensors if "positional_embedding_vlm" in name and ".weight" not in name: name += ".weight" + if "multi_modal_projector.linear_1" in name: + # despite the name with number postfix, this is a single fully connected layer + return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC], data_torch)] return [(self.map_tensor_name(name), data_torch)] return [] @@ -2615,7 +2688,7 @@ class QwenModel(TextModel): self.gguf_writer.add_file_type(self.ftype) -@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM") +@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", 
"Qwen2AudioForConditionalGeneration") class Qwen2Model(TextModel): model_arch = gguf.MODEL_ARCH.QWEN2 @@ -2639,13 +2712,19 @@ class Qwen2Model(TextModel): name = f"model.{name}" # map to Qwen2ForCausalLM tensors if "language_model." in name: name = name.replace("language_model.", "") # for InternVL - if name.startswith("mlp") or name.startswith("vision_model"): - # skip visual tensors + if name.startswith("mlp") or name.startswith("multi_modal_projector") \ + or name.startswith("vision_model") or name.startswith("audio_tower"): + # skip vision and audio tensors return [] yield from super().modify_tensors(data_torch, name, bid) -@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") +@ModelBase.register( + "Qwen2VLModel", + "Qwen2VLForConditionalGeneration", + "Qwen2_5_VLForConditionalGeneration", + "Qwen2_5OmniModel", +) class Qwen2VLModel(TextModel): model_arch = gguf.MODEL_ARCH.QWEN2VL @@ -2663,31 +2742,40 @@ class Qwen2VLModel(TextModel): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused - if name.startswith("visual."): - # skip visual tensors + if name.startswith("thinker."): + name = name.replace("thinker.", "") + if name.startswith("visual") or name.startswith("audio") or \ + name.startswith("talker") or name.startswith("token2wav"): + # skip multimodal tensors return [] return [(self.map_tensor_name(name), data_torch)] @ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") -class Qwen2VLVisionModel(VisionModel): +class Qwen2VLVisionModel(MmprojModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.hparams["image_size"] = self.hparams.get("image_size", 560) + assert self.hparams_vision is not None + self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560) # rename config.json values - self.hparams["num_attention_heads"] = self.hparams.get("num_heads") - self.hparams["num_hidden_layers"] = self.hparams.get("depth") - if "embed_dim" in self.hparams: # qwen2vl - self.hparams["intermediate_size"] = self.hparams.get("hidden_size") - self.hparams["hidden_size"] = self.hparams.get("embed_dim") + self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads") + self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth") + if "embed_dim" in self.hparams_vision: # qwen2vl + self.hparams_vision["intermediate_size"] = self.hparams_vision.get("hidden_size") + self.hparams_vision["hidden_size"] = self.hparams_vision.get("embed_dim") def set_gguf_parameters(self): super().set_gguf_parameters() - hparams = self.hparams - if self.global_config['model_type'] == 'qwen2_vl': - self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN2VL) - elif self.global_config['model_type'] == 'qwen2_5_vl': - self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN25VL) + assert self.hparams_vision is not None + hparams = self.hparams_vision + model_type = self.global_config['model_type'] + if model_type == 'qwen2_vl': + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2VL) + elif model_type == 'qwen2_5_vl' or model_type == 'qwen2_5_omni': + if model_type == 'qwen2_5_omni': + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25O) + else: + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL) self.gguf_writer.add_vision_use_silu(True) # find n_wa_pattern (window 
attention pattern) fullatt_block_indexes = hparams.get("fullatt_block_indexes") @@ -2745,12 +2833,72 @@ class Qwen2VLVisionModel(VisionModel): return [] # skip other tensors +@ModelBase.register("Qwen2_5OmniModel") +class Qwen25OmniModel(Qwen2VLVisionModel): + has_vision_encoder = True + has_audio_encoder = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_audio is not None + self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"] + self.hparams_audio["intermediate_size"] = self.hparams_audio["encoder_ffn_dim"] + self.hparams_audio["num_attention_heads"] = self.hparams_audio["encoder_attention_heads"] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + assert self.hparams_audio is not None + self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["num_mel_bins"]) + self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-5)) + + def get_vision_config(self) -> dict[str, Any] | None: + return self.global_config["thinker_config"].get("vision_config") + + def get_audio_config(self) -> dict[str, Any] | None: + return self.global_config["thinker_config"].get("audio_config") + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + # SinusoidsPositionEmbedding + assert self.hparams_audio is not None + max_timescale = 10000 + length = 1500 + channels = self.hparams_audio["hidden_size"] + log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1) + inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float()) + scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :] + pos_embd = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1).to(dtype=torch.float32) + yield ("audio_tower.embed_positions.weight", pos_embd) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + del bid, new_name, n_dims # unused + if ".conv" in name and ".weight" in name: + return gguf.GGMLQuantizationType.F16 + return False + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.startswith("thinker."): + name = name.replace("thinker.", "") + + if name.startswith("audio_tower"): + # process audio tensors + if "conv1.bias" in name or "conv2.bias" in name: + # transpose conv1 and conv2 bias + data_torch = data_torch.unsqueeze(-1) + if "audio_bos_eos_token" in name: + # this tensor is left unused in transformers code + # https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809 + return [] + return [(self.map_tensor_name(name), data_torch)] + + return super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("InternVisionModel") -class InternVisionModel(VisionModel): +class InternVisionModel(MmprojModel): def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams - self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.INTERNVL) + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.INTERNVL) self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"]) # hidden_act if hparams["hidden_act"] == "silu": @@ -3541,7 +3689,7 @@ class InternLM3Model(TextModel): return [(self.map_tensor_name(name), data_torch)] -@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel") +@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel", 
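For reference, a NumPy re-derivation (illustrative only) of the Whisper-style sinusoidal position table that `generate_extra_tensors()` above emits as `audio_tower.embed_positions.weight`. The 1280-channel default is an assumption based on Whisper-large; the converter reads it from the audio config's `hidden_size`/`d_model`.

```python
import numpy as np

def sinusoids(length: int = 1500, channels: int = 1280, max_timescale: float = 10000.0) -> np.ndarray:
    # Geometrically spaced frequencies, half sine / half cosine per position.
    log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
    inv_timescales = np.exp(-log_timescale_increment * np.arange(channels // 2))
    scaled_time = np.arange(length)[:, None] * inv_timescales[None, :]
    return np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1).astype(np.float32)

# sinusoids().shape == (1500, channels); row t encodes position t at all frequencies.
```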
"BertForSequenceClassification") class BertModel(TextModel): model_arch = gguf.MODEL_ARCH.BERT @@ -3549,11 +3697,21 @@ class BertModel(TextModel): super().__init__(*args, **kwargs) self.vocab_size = None + if cls_out_labels := self.hparams.get("id2label"): + if len(cls_out_labels) == 2 and cls_out_labels[0] == "LABEL_0": + # Remove dummy labels added by AutoConfig + cls_out_labels = None + self.cls_out_labels = cls_out_labels + def set_gguf_parameters(self): super().set_gguf_parameters() self.gguf_writer.add_causal_attention(False) self._try_set_pooling_type() + if self.cls_out_labels: + key_name = gguf.Keys.Classifier.OUTPUT_LABELS.format(arch = gguf.MODEL_ARCH_NAMES[self.model_arch]) + self.gguf_writer.add_array(key_name, [v for k, v in sorted(self.cls_out_labels.items())]) + def set_vocab(self): tokens, toktypes, tokpre = self.get_vocab_base() self.vocab_size = len(tokens) @@ -3604,6 +3762,14 @@ class BertModel(TextModel): if name.startswith("cls.seq_relationship"): return [] + if self.cls_out_labels: + # For BertForSequenceClassification (direct projection layer) + if name == "classifier.weight": + name = "classifier.out_proj.weight" + + if name == "classifier.bias": + name = "classifier.out_proj.bias" + return [(self.map_tensor_name(name), data_torch)] def _xlmroberta_tokenizer_init(self) -> None: @@ -3648,7 +3814,7 @@ class BertModel(TextModel): remove_whitespaces = tokenizer.clean_up_tokenization_spaces precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"]) - vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size) + vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size) else: sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) @@ -3661,7 +3827,7 @@ class BertModel(TextModel): tokenizer = SentencePieceProcessor() tokenizer.LoadFromFile(str(tokenizer_path)) - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size()) tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] scores: list[float] = [-10000.0] * vocab_size @@ -3691,33 +3857,26 @@ class BertModel(TextModel): unk_token = tokenizer_config_json.get("unk_token") unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3)) - for token_id in range(vocab_size): + for token_id in range(tokenizer.vocab_size): piece = tokenizer._convert_id_to_token(token_id) - text = piece.encode("utf-8") - score = tokenizer_json["model"]["vocab"][token_id][1] + if (piece := tokenizer._convert_id_to_token(token_id)) is not None: + text = piece.encode("utf-8") + score = tokenizer_json["model"]["vocab"][token_id][1] - toktype = SentencePieceTokenTypes.NORMAL - if token_id == unk_token_id: - toktype = SentencePieceTokenTypes.UNKNOWN - elif token_id in tokenizer.all_special_ids: - toktype = SentencePieceTokenTypes.CONTROL - elif token_id in added_vocab.values(): - toktype = SentencePieceTokenTypes.USER_DEFINED - # No reliable way to detect this, but jina-embeddings-v3 doesn't have any - # elif tokenizer.IsByte(token_id): - # toktype = SentencePieceTokenTypes.BYTE + toktype = SentencePieceTokenTypes.NORMAL + if token_id == unk_token_id: + toktype = SentencePieceTokenTypes.UNKNOWN + elif token_id in tokenizer.all_special_ids: + toktype = SentencePieceTokenTypes.CONTROL + elif token_id in added_vocab.values(): + toktype = SentencePieceTokenTypes.USER_DEFINED + # 
No reliable way to detect this, but jina doesn't have any + # elif tokenizer.IsByte(token_id): + # toktype = SentencePieceTokenTypes.BYTE - tokens[token_id] = text - scores[token_id] = score - toktypes[token_id] = toktype - - if vocab_size > len(tokens): - pad_count = vocab_size - len(tokens) - logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") - for i in range(1, pad_count + 1): - tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.UNUSED) + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype if isinstance(tokenizer, SentencePieceProcessor): # realign tokens (see HF tokenizer code) @@ -3730,6 +3889,12 @@ class BertModel(TextModel): SentencePieceTokenTypes.UNKNOWN, ] + toktypes[3:-1] + if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE: + # Add mask token missing from sentencepiece.bpe.model + tokens[250001] = b'' + scores[250001] = 0.0 + toktypes[250001] = SentencePieceTokenTypes.CONTROL + self.gguf_writer.add_tokenizer_model("t5") self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_token_list(tokens) @@ -3748,7 +3913,27 @@ class BertModel(TextModel): self.gguf_writer.add_add_eos_token(True) -@ModelBase.register("RobertaModel") +@ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification") +class DistilBertModel(BertModel): + model_arch = gguf.MODEL_ARCH.BERT + + def set_gguf_parameters(self): + self.gguf_writer.add_layer_norm_eps(1e-12) + logger.info("gguf: layer norm epsilon = 1e-12") + super().set_gguf_parameters() + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.startswith("distilbert."): + name = name[11:] + + # These layers act as MLM head, so we don't need them + if name.startswith("vocab_"): + return [] + + return super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("RobertaModel", "RobertaForSequenceClassification") class RobertaModel(BertModel): model_arch = gguf.MODEL_ARCH.BERT @@ -4086,11 +4271,11 @@ class Gemma3Model(TextModel): @ModelBase.register("Gemma3ForConditionalGeneration") -class Gemma3VisionModel(VisionModel): +class Gemma3VisionModel(MmprojModel): def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams - self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.GEMMA3) + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3) # default values below are taken from HF tranformers code self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6)) self.gguf_writer.add_vision_use_gelu(True) @@ -6037,6 +6222,65 @@ class ChameleonModel(TextModel): return data_torch +@ModelBase.register("UltravoxModel") +class UltravoxModel(TextModel): + model_arch = gguf.MODEL_ARCH.LLAMA # dummy + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + raise NotImplementedError("Ultravox does not have text decoder. Instead, it uses Llama or other models for text. 
If you want to get the audio encoder, please use --mmproj argument") + + +@ModelBase.register("Qwen2AudioForConditionalGeneration") +class WhisperEncoderModel(MmprojModel): + has_vision_encoder = False # no vision encoder + has_audio_encoder = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.hparams["hidden_size"] = self.hparams["d_model"] + self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"] + self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2A) + self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"]) + self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5)) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + del bid, new_name, n_dims # unused + if ".conv" in name and ".weight" in name: + return gguf.GGMLQuantizationType.F16 + return False + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + if name.startswith("language_model."): + # skip language model tensors + return [] + + # prevent clash naming with vision tensors + if name.startswith("multi_modal_projector"): + name = "audio." + name + + if "conv1.bias" in name or "conv2.bias" in name: + # transpose conv1 and conv2 bias + data_torch = data_torch.unsqueeze(-1) + + return [(self.map_tensor_name(name), data_torch)] + + +@ModelBase.register("UltravoxModel") +class UltravoxWhisperEncoderModel(WhisperEncoderModel): + has_vision_encoder = False # no vision encoder + has_audio_encoder = True + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"]) + ###### CONVERSION LOGIC ###### @@ -6212,13 +6456,15 @@ def split_str_to_n_bytes(split_str: str) -> int: def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str: + # TODO @ngxson : this won't work correctly if the model has both audio & vision encoders + # maybe we should fallback to text model's arch in that case, since not many models have both text_config = hparams.get("text_config", {}) vision_config = hparams.get("vision_config", {}) arch = hparams["architectures"][0] # if "architectures" is found in the sub-config, use that instead if model_type == ModelType.TEXT and text_config.get("architectures") is not None: arch = text_config["architectures"][0] - elif model_type == ModelType.VISION and vision_config.get("architectures") is not None: + elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None: arch = vision_config["architectures"][0] return arch @@ -6281,7 +6527,7 @@ def main() -> None: with torch.inference_mode(): output_type = ftype_map[args.outtype] - model_type = ModelType.VISION if args.mmproj else ModelType.TEXT + model_type = ModelType.MMPROJ if args.mmproj else ModelType.TEXT hparams = ModelBase.load_hparams(dir_model) model_architecture = get_model_architecture(hparams, model_type) logger.info(f"Model architecture: {model_architecture}") diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 5993a4c98..2f733f097 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -1,28 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# This script downloads the tokenizer models of the specified models from Huggingface and -# 
generates the get_vocab_base_pre() function for convert_hf_to_gguf.py -# -# This is necessary in order to analyze the type of pre-tokenizer used by the model and -# provide the necessary information to llama.cpp via the GGUF header in order to implement -# the same pre-tokenizer. -# -# ref: https://github.com/ggml-org/llama.cpp/pull/6920 -# -# Instructions: -# -# - Add a new model to the "models" list -# - Run the script with your huggingface token: -# -# python3 convert_hf_to_gguf_update.py -# -# - The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated -# - Update llama.cpp with the new pre-tokenizer if necessary -# -# TODO: generate tokenizer tests for llama.cpp -# - import logging import os import pathlib @@ -32,6 +10,7 @@ import requests import sys import json import shutil +import argparse from hashlib import sha256 from enum import IntEnum, auto @@ -41,6 +20,11 @@ logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger("convert_hf_to_gguf_update") sess = requests.Session() +convert_py_pth = pathlib.Path("convert_hf_to_gguf.py") +convert_py = convert_py_pth.read_text(encoding="utf-8") +hf_token_pth = pathlib.Path.home() / ".cache" / "huggingface" / "token" +hf_token = hf_token_pth.read_text(encoding="utf-8").strip() if hf_token_pth.exists() else None + class TOKENIZER_TYPE(IntEnum): SPM = auto() @@ -49,20 +33,49 @@ class TOKENIZER_TYPE(IntEnum): UGM = auto() +DOC_STRING = """ +This script downloads the tokenizer models of the specified models from Huggingface and +generates the get_vocab_base_pre() function for convert_hf_to_gguf.py + +/!\\ It is intended to be used by contributors and is not meant to be run by end users + +This is necessary in order to analyze the type of pre-tokenizer used by the model and +provide the necessary information to llama.cpp via the GGUF header in order to implement +the same pre-tokenizer. + +ref: https://github.com/ggml-org/llama.cpp/pull/6920 + +Instructions: + +- Add a new model to the "models" list +- Run the script with your huggingface token + By default, token will be read from ~/.cache/huggingface/token +- The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated +- Update llama.cpp with the new pre-tokenizer if necessary +""" +# TODO: generate tokenizer tests for llama.cpp + +parser = argparse.ArgumentParser(description=DOC_STRING, formatter_class=argparse.RawTextHelpFormatter) +parser.add_argument( + "--full", action="store_true", + help="download full list of models - make sure you have access to all of them", +) +parser.add_argument( + "hf_token", + help="optional HF token", + nargs="?", +) +args = parser.parse_args() +hf_token = args.hf_token if args.hf_token is not None else hf_token + +if hf_token is None: + logger.error("HF token is required. Please provide it as an argument or set it in ~/.cache/huggingface/token") + sys.exit(1) + # TODO: this string has to exercise as much pre-tokenizer functionality as possible # will be updated with time - contributions welcome CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? 
We\'Ve a\'lL' -if len(sys.argv) == 2: - token = sys.argv[1] - if not token.startswith("hf_"): - logger.info("Huggingface token seems invalid") - logger.info("Usage: python convert_hf_to_gguf_update.py ") - sys.exit(1) -else: - logger.info("Usage: python convert_hf_to_gguf_update.py ") - sys.exit(1) - # TODO: add models here, base models preferred models = [ {"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", }, @@ -103,7 +116,6 @@ models = [ {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", }, {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", }, {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", }, - {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", }, {"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"}, {"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"}, {"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"}, @@ -114,11 +126,19 @@ models = [ {"name": "trillion", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", }, {"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", }, {"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", }, - {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", }, {"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", }, {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", }, ] +# some models are known to be broken upstream, so we will skip them as exceptions +pre_computed_hashes = [ + # chatglm-bpe has 2 hashes, why? 
+ {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b"}, + {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"}, + {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"}, + {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"}, +] + def download_file_with_auth(url, token, save_path): headers = {"Authorization": f"Bearer {token}"} @@ -169,9 +189,29 @@ def download_model(model): if os.path.isfile(save_path): logger.info(f"{name}: File {save_path} already exists - skipping") continue - download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path) + download_file_with_auth(f"{repo}/resolve/main/{file}", hf_token, save_path) +# get list of existing models and chkhsh from the convert_hf_to_gguf.py file +# returns mapping res --> chkhsh +def get_existing_models(convert_py): + pattern = r'if chkhsh == "([a-f0-9]{64})":\s*\n\s*.*\s*res = "([^"]+)"' + matches = re.findall(pattern, convert_py) + output = {} + for chkhsh, res in matches: + output[res] = chkhsh + return output + + +existing_models = {} +all_models = models.copy() +if not args.full: + # Filter out models that already exist in convert_hf_to_gguf.py + existing_models = get_existing_models(convert_py) + all_models = models.copy() + models = [model for model in all_models if model["name"] not in existing_models] + +logging.info(f"Downloading {len(models)} models...") for model in models: try: download_model(model) @@ -182,9 +222,10 @@ for model in models: # generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function: src_ifs = "" -for model in models: +for model in [*all_models, *pre_computed_hashes]: name = model["name"] tokt = model["tokt"] + chkhsh = model.get("chkhsh") if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM: continue @@ -195,35 +236,44 @@ for model in models: continue # create the tokenizer - try: - if name == "t5": - tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False) - else: - tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}") - except OSError as e: - logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}") - continue # Skip to the next model if the tokenizer can't be loaded + if chkhsh is not None: + # if the model has a pre-computed hash, use it + logger.info(f"Using pre-computed hash for model {name}: {chkhsh}") + elif name in existing_models: + # if the model already exists in convert_hf_to_gguf.py, skip compute hash + chkhsh = existing_models[name] + else: + # otherwise, compute the hash of the tokenizer + try: + logger.info(f"Loading tokenizer from {f'models/tokenizers/{name}'}...") + if name == "t5": + tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False) + else: + tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}") + except OSError as e: + logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. 
Error: {e}") + continue # Skip to the next model if the tokenizer can't be loaded - chktok = tokenizer.encode(CHK_TXT) - chkhsh = sha256(str(chktok).encode()).hexdigest() + chktok = tokenizer.encode(CHK_TXT) + chkhsh = sha256(str(chktok).encode()).hexdigest() - logger.info(f"model: {name}") - logger.info(f"tokt: {tokt}") - logger.info(f"repo: {model['repo']}") - logger.info(f"chktok: {chktok}") - logger.info(f"chkhsh: {chkhsh}") + logger.info(f"model: {name}") + logger.info(f"tokt: {tokt}") + logger.info(f"repo: {model['repo']}") + logger.info(f"chktok: {chktok}") + logger.info(f"chkhsh: {chkhsh}") - # print the "pre_tokenizer" content from the tokenizer.json - with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f: - cfg = json.load(f) - normalizer = cfg["normalizer"] - logger.info("normalizer: " + json.dumps(normalizer, indent=4)) - pre_tokenizer = cfg["pre_tokenizer"] - logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4)) - if "ignore_merges" in cfg["model"]: - logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4)) + # print the "pre_tokenizer" content from the tokenizer.json + with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f: + cfg = json.load(f) + normalizer = cfg["normalizer"] + logger.info("normalizer: " + json.dumps(normalizer, indent=4)) + pre_tokenizer = cfg["pre_tokenizer"] + logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4)) + if "ignore_merges" in cfg["model"]: + logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4)) - logger.info("") + logger.info("") src_ifs += f" if chkhsh == \"{chkhsh}\":\n" src_ifs += f" # ref: {model['repo']}\n" @@ -271,8 +321,6 @@ src_func = f""" return res """ -convert_py_pth = pathlib.Path("convert_hf_to_gguf.py") -convert_py = convert_py_pth.read_text(encoding="utf-8") convert_py = re.sub( r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)", lambda m: m.group(1) + src_func + m.group(3), @@ -288,7 +336,7 @@ logger.info("+++ convert_hf_to_gguf.py was updated") tests = [ "ied 4 ½ months", - "Führer", + "Äpfel", "", " ", " ", @@ -367,6 +415,10 @@ for model in models: logger.error(f"Failed to load tokenizer for model {name}. Error: {e}") continue # Skip this model and continue with the next one in the loop + if not os.path.exists(f"models/ggml-vocab-{name}.gguf"): + logger.info(f"Skip vocab files for model {name}, no GGUF file found") + continue + with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f: for text in tests: f.write(f"{text}") diff --git a/docs/backend/CANN.md b/docs/backend/CANN.md old mode 100644 new mode 100755 index e172ec5c2..a5ba617ca --- a/docs/backend/CANN.md +++ b/docs/backend/CANN.md @@ -280,6 +280,15 @@ cmake --build build --config release ### **GitHub contribution**: Please add the **[CANN]** prefix/tag in issues/PRs titles to help the CANN-team check/address them without delay. +## Updates +### Basic Flash Attention Support +The basic FA kernel with aclnnops has been added in aclnn_ops.cpp. +Currently, the FA only supports the cases with FP16 KV tensors and NO logit softcap. +Since the aclnn interface for flash attention cannot support the logit softcap, we will only update the quantized version in the future. + +Authors from Peking University: Bizhao Shi (bshi@pku.edu.cn), Yuxin Yang (yxyang@pku.edu.cn), Ruiyang Ma (ruiyang@stu.pku.edu.cn), and Guojie Luo (gluo@pku.edu.cn). 
+ +We would like to thank Tuo Dai, Shanni Li, and all of the project maintainers from Huawei Technologies Co., Ltd for their help during the code development and pull request. ## TODO - Support more models and data types. diff --git a/docs/build.md b/docs/build.md index c9027c0b5..32717a793 100644 --- a/docs/build.md +++ b/docs/build.md @@ -63,6 +63,7 @@ cmake --build build --config Release cmake --preset x64-windows-llvm-release cmake --build build-x64-windows-llvm-release ``` +- Curl usage is enabled by default and can be turned off with `-DLLAMA_CURL=OFF`. Otherwise you need to install development libraries for libcurl. ## BLAS Build diff --git a/docs/function-calling.md b/docs/function-calling.md index c3873c3fa..fd3db9bd1 100644 --- a/docs/function-calling.md +++ b/docs/function-calling.md @@ -2,7 +2,6 @@ [chat.h](../common/chat.h) (https://github.com/ggml-org/llama.cpp/pull/9639) adds support for [OpenAI-style function calling](https://platform.openai.com/docs/guides/function-calling) and is used in: - `llama-server` when started w/ `--jinja` flag -- `llama-cli` (WIP: https://github.com/ggml-org/llama.cpp/pull/11556) ## Universal support w/ Native & Generic handlers @@ -325,36 +324,65 @@ To get the official template from original HuggingFace repos, you can use [scrip > [!TIP] > If there is no official `tool_use` Jinja template, you may want to set `--chat-template chatml` to use a default that works with many models (YMMV!), or write your own (e.g. we provide a custom [llama-cpp-deepseek-r1.jinja](../models/templates/llama-cpp-deepseek-r1.jinja) for DeepSeek R1 distills) +> [!CAUTION] +> Beware of extreme KV quantizations (e.g. `-ctk q4_0`), they can substantially degrade the model's tool calling performance. + Test in CLI (or with any library / software that can use OpenAI-compatible API backends): ```bash curl http://localhost:8080/v1/chat/completions -d '{ -"model": "gpt-3.5-turbo", -"tools": [ - { - "type":"function", - "function":{ - "name":"python", - "description":"Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.", - "parameters":{ - "type":"object", - "properties":{ - "code":{ - "type":"string", - "description":"The code to run in the ipython interpreter." + "model": "gpt-3.5-turbo", + "tools": [ + { + "type":"function", + "function":{ + "name":"python", + "description":"Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.", + "parameters":{ + "type":"object", + "properties":{ + "code":{ + "type":"string", + "description":"The code to run in the ipython interpreter." + } + }, + "required":["code"] } - }, - "required":["code"] } - } - } -], -"messages": [ - { - "role": "user", - "content": "Print a hello world message with python." - } -] + } + ], + "messages": [ + { + "role": "user", + "content": "Print a hello world message with python." + } + ] +}' + + +curl http://localhost:8080/v1/chat/completions -d '{ + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "system", "content": "You are a chatbot that uses tools/functions. Dont overthink things."}, + {"role": "user", "content": "What is the weather in Istanbul?"} + ], + "tools": [{ + "type":"function", + "function":{ + "name":"get_current_weather", + "description":"Get the current weather in a given location", + "parameters":{ + "type":"object", + "properties":{ + "location":{ + "type":"string", + "description":"The city and country/state, e.g. 
`San Francisco, CA`, or `Paris, France`" + } + }, + "required":["location"] + } + } + }] }' ``` diff --git a/docs/multimodal.md b/docs/multimodal.md index 054778e91..e849c2a0b 100644 --- a/docs/multimodal.md +++ b/docs/multimodal.md @@ -4,7 +4,9 @@ llama.cpp supports multimodal input via `libmtmd`. Currently, there are 2 tools - [llama-mtmd-cli](../tools/mtmd/README.md) - [llama-server](../tools/server/README.md) via OpenAI-compatible `/chat/completions` API -To enable it, can use use one of the 2 methods below: +Currently, we support **image** and **audio** input. Audio is highly experimental and may have reduced quality. + +To enable it, you can use one of the 2 methods below: - Use `-hf` option with a supported model (see a list of pre-quantized model below) - To load a model using `-hf` while disabling multimodal, use `--no-mmproj` @@ -31,12 +33,14 @@ llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload ## Pre-quantized models -These are ready-to-use models, most of them come with `Q4_K_M` quantization by default. They can be found at the Hugging Face page of the ggml-org: https://huggingface.co/ggml-org +These are ready-to-use models, most of them come with `Q4_K_M` quantization by default. They can be found at the Hugging Face page of the ggml-org: https://huggingface.co/collections/ggml-org/multimodal-ggufs-68244e01ff1f39e5bebeeedc Replaces the `(tool_name)` with the name of binary you want to use. For example, `llama-mtmd-cli` or `llama-server` NOTE: some models may require large context window, for example: `-c 8192` +**Vision models**: + ```sh # Gemma 3 (tool_name) -hf ggml-org/gemma-3-4b-it-GGUF @@ -77,4 +81,29 @@ NOTE: some models may require large context window, for example: `-c 8192` # Llama 4 Scout (tool_name) -hf ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF + +# Moondream2 20250414 version +(tool_name) -hf ggml-org/moondream2-20250414-GGUF + +``` + +**Audio models**: + +```sh +# Ultravox 0.5 +(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF +(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF + +# Qwen2-Audio and SeaLLM-Audio +# note: no pre-quantized GGUF this model, as they have very poor result +# ref: https://github.com/ggml-org/llama.cpp/pull/13760 +``` + +**Mixed modalities**: + +```sh +# Qwen2.5 Omni +# Capabilities: audio input, vision input +(tool_name) -hf ggml-org/Qwen2.5-Omni-3B-GGUF +(tool_name) -hf ggml-org/Qwen2.5-Omni-7B-GGUF ``` diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 01ff6763f..71f700877 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -41,8 +41,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu // run model LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); - if (llama_encode(ctx, batch) < 0) { - LOG_ERR("%s : failed to encode\n", __func__); + if (llama_decode(ctx, batch) < 0) { + LOG_ERR("%s : failed to process\n", __func__); } for (int i = 0; i < batch.n_tokens; i++) { diff --git a/examples/parallel/README.md b/examples/parallel/README.md index ece3a6641..2468a30d2 100644 --- a/examples/parallel/README.md +++ b/examples/parallel/README.md @@ -4,7 +4,7 @@ Simplified simulation of serving incoming requests in parallel ## Example -Generate 128 client requests (`-ns 128`), simulating 8 concurrent clients (`-np 8`). The system prompt is shared (`-pps`), meaning that it is computed once at the start. 
The client requests consist of 10 junk questions (`-j 10`) followed by the actual question. +Generate 128 client requests (`-ns 128`), simulating 8 concurrent clients (`-np 8`). The system prompt is shared (`-pps`), meaning that it is computed once at the start. The client requests consist of up to 10 junk questions (`--junk 10`) followed by the actual question. ```bash llama-parallel -m model.gguf -np 8 -ns 128 --top-k 1 -pps --junk 10 -c 16384 diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index acb1301a2..cd85bea9a 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -158,7 +158,7 @@ int main(int argc, char ** argv) { common_params params; params.n_predict = 128; - params.n_junk = 0; + params.n_junk = 1; if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) { return 1; @@ -182,7 +182,7 @@ int main(int argc, char ** argv) { const bool is_sp_shared = params.is_pp_shared; // extra text to insert in each client's prompt in order to make it larger - const int32_t n_junk = params.n_junk; + const int32_t n_junk = std::max(1, params.n_junk); // init llama.cpp llama_backend_init(); @@ -315,7 +315,10 @@ int main(int argc, char ** argv) { } else { client.prompt += k_system; } - for (int i = 0; i < n_junk; ++i) { + + const int n_junk_cur = rand() % n_junk; + + for (int i = 0; i < n_junk_cur; ++i) { const int r = rand() % k_questions.size(); client.prompt += "User:\n" + k_questions[r] + "\nAssistant:\n " + k_answers[r] + "\n"; } @@ -340,7 +343,7 @@ int main(int argc, char ** argv) { client.n_decoded = 0; client.i_batch = batch.n_tokens - 1; - LOG_INF("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id); + LOG_INF("\033[31mClient %3d, seq %4d, junk = %4d, started decoding ...\033[0m\n", client.id, client.seq_id, n_junk_cur); g_seq_id += 1; @@ -359,7 +362,9 @@ int main(int argc, char ** argv) { // process in chunks of params.n_batch int32_t n_batch = params.n_batch; - for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { + int32_t i_next = 0; + + for (int32_t i = 0; i < batch.n_tokens; i = i_next) { // experiment: process in powers of 2 //if (i + n_batch > (int32_t) batch.n_tokens && n_batch > 32) { // n_batch /= 2; @@ -367,7 +372,7 @@ int main(int argc, char ** argv) { // continue; //} - const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); + const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i); llama_batch batch_view = { n_tokens, @@ -387,19 +392,24 @@ int main(int argc, char ** argv) { return 1; } - LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2); + LOG_WRN("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2); n_cache_miss += 1; // retry with half the batch size to try to find a free slot in the KV cache n_batch /= 2; - i -= n_batch; continue; } LOG_DBG("%s : decoded batch of %d tokens\n", __func__, n_tokens); + // move the head of the batch forward with the number of tokens we just processed + i_next = i + n_tokens; + + // on successful decode, restore the original batch size + n_batch = params.n_batch; + for (auto & client : clients) { if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) { continue; diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 347ea4a69..5ac881b45 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -133,9 +133,8 @@ int main(int argc, char ** argv) { const int ib = 
i/n_batch - 1; const int bd = n_batch_grp*(n_grp - 1); - llama_kv_self_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd); - llama_kv_self_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); - llama_kv_self_update (ctx); + llama_kv_self_seq_add(ctx, 0, n_past - n_batch, n_past, ib*bd); + llama_kv_self_seq_div(ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1; } @@ -169,8 +168,6 @@ int main(int argc, char ** argv) { llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); - //llama_kv_self_defrag (ctx); - llama_kv_self_update (ctx); n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1; @@ -200,8 +197,6 @@ int main(int argc, char ** argv) { llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); - //llama_kv_self_defrag (ctx); - llama_kv_self_update (ctx); n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1; } diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index e3d0c9542..754da1411 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -81,14 +81,14 @@ static void batch_add_seq(llama_batch & batch, const std::vector & toke } } -static void batch_encode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) { +static void batch_process(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) { // clear previous kv_cache values (irrelevant for embeddings) llama_kv_self_clear(ctx); // run model LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); - if (llama_encode(ctx, batch) < 0) { - LOG_ERR("%s : failed to encode\n", __func__); + if (llama_decode(ctx, batch) < 0) { + LOG_ERR("%s : failed to process\n", __func__); } for (int i = 0; i < batch.n_tokens; i++) { @@ -233,7 +233,7 @@ int main(int argc, char ** argv) { // encode if at capacity if (batch.n_tokens + n_toks > n_batch) { float * out = emb + p * n_embd; - batch_encode(ctx, batch, out, s, n_embd); + batch_process(ctx, batch, out, s, n_embd); common_batch_clear(batch); p += s; s = 0; @@ -246,7 +246,7 @@ int main(int argc, char ** argv) { // final batch float * out = emb + p * n_embd; - batch_encode(ctx, batch, out, s, n_embd); + batch_process(ctx, batch, out, s, n_embd); // save embeddings to chunks for (int i = 0; i < n_chunks; i++) { @@ -267,7 +267,7 @@ int main(int argc, char ** argv) { batch_add_seq(query_batch, query_tokens, 0); std::vector query_emb(n_embd, 0); - batch_encode(ctx, query_batch, query_emb.data(), 1, n_embd); + batch_process(ctx, query_batch, query_emb.data(), 1, n_embd); common_batch_clear(query_batch); diff --git a/examples/training/README.md b/examples/training/README.md index ecdf398f8..df4252792 100644 --- a/examples/training/README.md +++ b/examples/training/README.md @@ -10,8 +10,8 @@ Proof of concept: ``` sh export model_name=llama_3.2-1b && export quantization=f32 -./build/bin/finetune --file wikitext-2-raw/wiki.test.raw -ngl 999 --model models/${model_name}-${quantization}.gguf -c 512 -b 512 -ub 512 -./build/bin/perplexity --file wikitext-2-raw/wiki.test.raw -ngl 999 --model finetuned-model.gguf +./build/bin/llama-finetune --file wikitext-2-raw/wiki.test.raw -ngl 999 --model models/${model_name}-${quantization}.gguf -c 512 -b 512 -ub 512 +./build/bin/llama-perplexity --file wikitext-2-raw/wiki.test.raw -ngl 999 --model finetuned-model.gguf ``` The perplexity value of 
the finetuned model should be lower after training on the test set for 2 epochs. diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 4746d5cb7..3d01184a2 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -129,6 +129,7 @@ option(GGML_LASX "ggml: enable lasx" ON) option(GGML_LSX "ggml: enable lsx" ON) option(GGML_RVV "ggml: enable rvv" ON) option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF) +option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF) option(GGML_VXE "ggml: enable vxe" ON) option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF) @@ -176,7 +177,6 @@ option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF) option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF) option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF) -option(GGML_VULKAN_PERF "ggml: enable Vulkan perf output" OFF) option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF) option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF) option(GGML_KOMPUTE "ggml: use Kompute" OFF) diff --git a/ggml/cmake/common.cmake b/ggml/cmake/common.cmake index 1976d0ae9..bb1ec9b37 100644 --- a/ggml/cmake/common.cmake +++ b/ggml/cmake/common.cmake @@ -24,3 +24,28 @@ function(ggml_get_flags CCID CCVER) set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE) set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE) endfunction() + +function(ggml_get_system_arch) + if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR + CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR + (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND + CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$")) + set(GGML_SYSTEM_ARCH "ARM" PARENT_SCOPE) + elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR + CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR + (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND + CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$")) + set(GGML_SYSTEM_ARCH "x86" PARENT_SCOPE) + elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR + "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ") + set(GGML_SYSTEM_ARCH "PowerPC" PARENT_SCOPE) + elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") + set(GGML_SYSTEM_ARCH "loongarch64" PARENT_SCOPE) + elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64") + set(GGML_SYSTEM_ARCH "riscv64" PARENT_SCOPE) + elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x") + set(GGML_SYSTEM_ARCH "s390x" PARENT_SCOPE) + else() + set(GGML_SYSTEM_ARCH "UNKNOWN" PARENT_SCOPE) + endif() +endfunction() diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index c81ff03fe..1a57f1cd7 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -528,15 +528,15 @@ extern "C" { GGML_UNARY_OP_STEP, GGML_UNARY_OP_TANH, GGML_UNARY_OP_ELU, + GGML_UNARY_OP_RELU, GGML_UNARY_OP_SIGMOID, GGML_UNARY_OP_GELU, - GGML_UNARY_OP_GELU_ERF, GGML_UNARY_OP_GELU_QUICK, GGML_UNARY_OP_SILU, GGML_UNARY_OP_HARDSWISH, GGML_UNARY_OP_HARDSIGMOID, GGML_UNARY_OP_EXP, - GGML_UNARY_OP_RELU, + GGML_UNARY_OP_GELU_ERF, GGML_UNARY_OP_COUNT, }; @@ -935,6 +935,15 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + // repeat a to the specified shape + GGML_API struct ggml_tensor * ggml_repeat_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3); + // sums repetitions in a into shape of b GGML_API struct ggml_tensor * ggml_repeat_back( struct ggml_context * 
ctx, @@ -2086,9 +2095,6 @@ extern "C" { GGML_API struct ggml_tensor * ggml_graph_get_grad (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node); GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node); - GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname); - GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval); - // print info and performance information for the graph GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph); @@ -2172,6 +2178,7 @@ extern "C" { // scheduling priorities enum ggml_sched_priority { + GGML_SCHED_PRIO_LOW = -1, GGML_SCHED_PRIO_NORMAL, GGML_SCHED_PRIO_MEDIUM, GGML_SCHED_PRIO_HIGH, diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index ddea5ad38..76b24bd9d 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -109,6 +109,8 @@ if (MSVC) else () set(CMAKE_GENERATOR_PLATFORM_LWR "") endif () +ggml_get_system_arch() +message(STATUS "GGML_SYSTEM_ARCH: ${GGML_SYSTEM_ARCH}") if (NOT MSVC) if (GGML_STATIC) @@ -194,6 +196,7 @@ add_library(ggml-base ../include/ggml-opt.h ../include/gguf.h ggml.c + ggml.cpp ggml-alloc.c ggml-backend.cpp ggml-opt.cpp @@ -224,6 +227,7 @@ function(ggml_add_backend_library backend) set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL) add_dependencies(ggml ${backend}) + install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR}) else() add_library(${backend} ${ARGN}) target_link_libraries(ggml PUBLIC ${backend}) @@ -287,16 +291,20 @@ if (GGML_CPU_ALL_VARIANTS) if (NOT GGML_BACKEND_DL) message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL") endif() - ggml_add_cpu_backend_variant(x64) - ggml_add_cpu_backend_variant(sse42 SSE42) - ggml_add_cpu_backend_variant(sandybridge SSE42 AVX) - ggml_add_cpu_backend_variant(haswell SSE42 AVX F16C AVX2 BMI2 FMA) - ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512) - ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI) - ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI) - if (NOT MSVC) - # MSVC doesn't support AMX - ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8) + if (GGML_SYSTEM_ARCH STREQUAL "x86") + ggml_add_cpu_backend_variant(x64) + ggml_add_cpu_backend_variant(sse42 SSE42) + ggml_add_cpu_backend_variant(sandybridge SSE42 AVX) + ggml_add_cpu_backend_variant(haswell SSE42 AVX F16C AVX2 BMI2 FMA) + ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512) + ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI) + ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI) + if (NOT MSVC) + # MSVC doesn't support AMX + ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8) + endif() + else() + message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported on ${GGML_SYSTEM_ARCH}") endif() elseif (GGML_CPU) ggml_add_cpu_backend_variant_impl("") diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index b30b4cb38..b1050ad59 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ 
-1340,7 +1340,10 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { // allocate graph if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) { // the re-allocation may cause the split inputs to be moved to a different address - ggml_backend_sched_synchronize(sched); + // synchronize without ggml_backend_sched_synchronize to avoid changing cur_copy + for (int i = 0; i < sched->n_backends; i++) { + ggml_backend_synchronize(sched->backends[i]); + } #ifndef NDEBUG GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed); #endif @@ -1564,7 +1567,6 @@ bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgra ggml_backend_sched_split_graph(sched, graph); - if (!ggml_backend_sched_alloc_splits(sched)) { return false; } @@ -1598,6 +1600,12 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) { for (int i = 0; i < sched->n_backends; i++) { ggml_backend_synchronize(sched->backends[i]); } + if (!sched->is_alloc) { + // if the graph is not already allocated, always use copy 0 after a synchronization + // this ensures that during generation the same copy is used every time, + // which avoids changes in the graph that could cause CUDA or other graphs to be disabled + sched->cur_copy = 0; + } } void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) { diff --git a/ggml/src/ggml-blas/CMakeLists.txt b/ggml/src/ggml-blas/CMakeLists.txt index 0bf3c05d9..76064c3fd 100644 --- a/ggml/src/ggml-blas/CMakeLists.txt +++ b/ggml/src/ggml-blas/CMakeLists.txt @@ -81,7 +81,7 @@ if (BLAS_FOUND) target_link_libraries (ggml-blas PRIVATE ${BLAS_LIBRARIES}) target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS}) else() - message(ERROR "BLAS not found, please refer to " - "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" - " to set correct GGML_BLAS_VENDOR") + message(FATAL_ERROR "BLAS not found, please refer to " + "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" + " to set correct GGML_BLAS_VENDOR") endif() diff --git a/ggml/src/ggml-cann/CMakeLists.txt b/ggml/src/ggml-cann/CMakeLists.txt old mode 100644 new mode 100755 index 0d8e483b2..7742b3915 --- a/ggml/src/ggml-cann/CMakeLists.txt +++ b/ggml/src/ggml-cann/CMakeLists.txt @@ -30,6 +30,7 @@ string(TOLOWER ${SOC_TYPE} SOC_VERSION) # SOC_VERSION need lower string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}") set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}") string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION) +message(STATUS "CANN: SOC_VERSION = ${SOC_VERSION}") if (CANN_INSTALL_DIR) # Only Support Linux. 
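To make the `SOC_TYPE` handling in the hunk above concrete, here is a minimal configure/build sketch for the CANN backend. The SoC name is an assumed example and must match the target device; the CMake code above lowercases it into `SOC_VERSION` and derives the `ASCEND_*` compile definition from it.

```sh
# Sketch of a CANN build; Ascend910B3 is an assumed example SoC, adjust to your hardware.
cmake -B build -DGGML_CANN=on -DSOC_TYPE=Ascend910B3 -DCMAKE_BUILD_TYPE=Release
cmake --build build --config Release -j
```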
diff --git a/ggml/src/ggml-cann/Doxyfile b/ggml/src/ggml-cann/Doxyfile old mode 100644 new mode 100755 diff --git a/ggml/src/ggml-cann/acl_tensor.cpp b/ggml/src/ggml-cann/acl_tensor.cpp old mode 100644 new mode 100755 index f5462c5a1..f311864d4 --- a/ggml/src/ggml-cann/acl_tensor.cpp +++ b/ggml/src/ggml-cann/acl_tensor.cpp @@ -31,6 +31,8 @@ aclDataType ggml_cann_type_mapping(ggml_type type) { return ACL_FLOAT; case GGML_TYPE_F16: return ACL_FLOAT16; + case GGML_TYPE_BF16: + return ACL_BF16; case GGML_TYPE_I8: return ACL_INT8; case GGML_TYPE_I16: diff --git a/ggml/src/ggml-cann/acl_tensor.h b/ggml/src/ggml-cann/acl_tensor.h old mode 100644 new mode 100755 diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp old mode 100644 new mode 100755 index cbf9783b7..437ece2d4 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -66,6 +66,7 @@ #include #include #include +#include #include #include @@ -74,11 +75,13 @@ #include #include "ggml-impl.h" +#include "ggml.h" #define GGML_COMMON_DECL_C #include "../ggml-common.h" + void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclTensor ** acl_src0, aclTensor ** acl_src1, aclTensor ** acl_dst) { GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_can_repeat(src1, src0)); @@ -2697,14 +2700,10 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor* } } - // GroupedMatmulV2 required tensor_list.size < 128 size_t GROUP_SIZE = 128; - std::vector> src0_tensor_vec_vec; - std::vector> src1_tensor_vec_vec; - std::vector> dst_tensor_vec_vec; - - // split and call GroupedMatmulV2 + // GroupedMatmulV2 required tensor_list.size < 128 for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) { + // split and call GroupedMatmulV2 size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size()); std::vector src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end); std::vector src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end); @@ -2722,6 +2721,133 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor* return; } +/** + * @brief Performs expert-specific matrix multiplication (MoE) with + * quantized precision using the CANN backend. + * + * This function executes a matrix multiplication operation tailored for + * Mixture of Experts (MoE) models, where the input tensor is multiplied + * with expert-specific quantized weight matrices. It leverages the CANN + * backend to perform efficient low-precision computations and stores the + * quantized result in the destination tensor `dst`. + * + * Quantization techniques reduce memory footprint and improve performance + * by using lower-bit representations (e.g., int8) instead of floating-point. + * This function is designed to work with such formats and may incorporate + * optimizations like identity-based fast paths or routing masks for sparse + * expert selection. + * + * @param ctx The context for executing CANN backend operations. + * @param dst The destination tensor where the quantized MoE multiplication result + * will be stored. + * + * @note This function assumes quantized data types and is designed for + * MoE architectures with potential sparse expert routing. 
+ */ +static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + // TODO: Use aclnnGroupedMatMul + //dst [M, K, N, 1] + ggml_tensor * src0 = dst->src[0]; //src0 [D, M, A, 1] + ggml_tensor * src1 = dst->src[1]; //src1 [D, B, N, 1], B = K or B = 1 + ggml_tensor * ids = dst->src[2]; //ids [K, N] + + GGML_TENSOR_BINARY_OP_LOCALS + + // copy index from npu to cpu + int64_t n_as = ne02; // A + int64_t n_ids = ids->ne[0]; // K + + std::vector ids_host(ggml_nbytes(ids)); + ggml_cann_async_memcpy(ctx, ids_host.data(), ids->data, ggml_nbytes(ids), + ACL_MEMCPY_DEVICE_TO_HOST); + ACL_CHECK(aclrtSynchronizeStream(ctx.stream())); + + char * src0_original = (char *) src0->data; + char * src1_original = (char *) src1->data; + char * dst_original = (char *) dst->data; + + ggml_tensor src0_row = *src0; + ggml_tensor src1_row = *src1; + ggml_tensor dst_row = *dst; + + const enum ggml_type type = dst->src[0]->type; + float weight_elem_size; + if (type == GGML_TYPE_Q4_0) { + weight_elem_size = float(sizeof(uint8_t)) / 2; + } else if (type == GGML_TYPE_Q8_0) { + weight_elem_size = float(sizeof(uint8_t)); + } else { + GGML_ABORT("MUL_MAT_ID only support quant type Q4_0 and Q8_0 "); + } + + // src0_row [D, M, 1, 1] weight without permute + src0_row.ne[2] = 1; + src0_row.ne[3] = 1; + src0_row.nb[0] = weight_elem_size; + src0_row.nb[1] = weight_elem_size * ne00; + src0_row.nb[2] = weight_elem_size * ne00; + src0_row.nb[3] = weight_elem_size * ne00; + size_t weight_stride = ne00 * ne01 * weight_elem_size; + size_t weight_size = weight_stride * ne02 * ne03; + + // scale [D, M, 1, 1] -> scale && permute + size_t scale_elem_size = sizeof(uint16_t); + size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size; + + // src1_row [D, 1, 1, 1] -> input + src1_row.ne[1] = 1; + src1_row.ne[2] = 1; + src1_row.ne[3] = 1; + src1_row.nb[2] = nb11; + src1_row.nb[3] = nb11; + + // dst_row [M, 1, 1, 1] -> out + dst_row.ne[1] = 1; + dst_row.ne[2] = 1; + dst_row.ne[3] = 1; + dst_row.nb[2] = nb1; + dst_row.nb[3] = nb1; + + //create weight for one row + ggml_cann_pool_alloc weight_allocator(ctx.pool()); + void* weight_buffer = weight_allocator.alloc(nb02); + for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) { + for (int64_t id = 0; id < n_ids; id++) { + // expert index + int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]); + GGML_ASSERT(i02 >= 0 && i02 < n_as); + + // If B = 1 (broadcast), always use 0; otherwise, use id. + int64_t i11 = (ne11 == 1 ? 
0 : id); + int64_t i12 = iid1; + + int64_t i1 = id; + int64_t i2 = i12; + + void* src0_tmp_ptr = src0_original + i02*weight_stride; + void* scale_tmp_ptr = src0_original + weight_size + i02*scale_stride; + void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12; + void* dst_tmp_ptr = dst_original + i1*nb1 + i2*nb2; + + // mem cpy + ggml_cann_async_memcpy(ctx, weight_buffer, src0_tmp_ptr, weight_stride, + ACL_MEMCPY_DEVICE_TO_DEVICE); + void* scale_buffer = (char*)weight_buffer + weight_stride; + ggml_cann_async_memcpy(ctx, scale_buffer, scale_tmp_ptr, scale_stride, + ACL_MEMCPY_DEVICE_TO_DEVICE); + + src0_row.data = weight_buffer; + src1_row.data = src1_tmp_ptr; + dst_row.data = dst_tmp_ptr; + dst_row.src[0] = &src0_row; + dst_row.src[1] = &src1_row; + + ggml_cann_mul_mat(ctx, &dst_row); + } + } + return; +} + void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst) { const enum ggml_type type = dst->src[0]->type; switch (type) { @@ -2729,8 +2855,339 @@ void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst) { case GGML_TYPE_F16: ggml_cann_mul_mat_id_fp(ctx, dst); break; + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q8_0: + ggml_cann_mul_mat_id_quant(ctx, dst); + break; default: GGML_ABORT("Unsupported type for mul_mat_id"); break; } } + +void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){ + + ggml_tensor* src0 = dst->src[0]; // q, fp32 + ggml_tensor* src1 = dst->src[1]; // k, fp16 + ggml_tensor* src2 = dst->src[2]; // v, fp16 + ggml_tensor* src3 = dst->src[3]; // mask, fp16 + + float maxBias = 0.0f; + float scaleValue = 1.0f; + float logitSoftcap = 0.0f; + memcpy(&scaleValue, (float*)dst->op_params + 0, sizeof(float)); + memcpy(&maxBias, (float*)dst->op_params + 1, sizeof(float)); + memcpy(&logitSoftcap, (float*)dst->op_params + 2, sizeof(float)); + + if(logitSoftcap == 0.0f){ + size_t faElemSize = sizeof(uint16_t); + auto faDataType = ACL_FLOAT16; //ACL_BF16; + + aclTensor* acl_src0_f16_tensor = nullptr; + aclTensor* acl_src1_f16_tensor = nullptr; + aclTensor* acl_src2_f16_tensor = nullptr; + aclTensor* acl_dst_f16_tensor = nullptr; + + // Step 1: cast the src0 (Query) to fp16 if needed + ggml_cann_pool_alloc src0_f16_allocator(ctx.pool()); + void* src0_f16_buffer = nullptr; + + if(ggml_cann_type_mapping(src0->type) != faDataType){ + aclTensor* acl_src0_f32_tensor = ggml_cann_create_tensor(src0); + src0_f16_buffer = src0_f16_allocator.alloc( + ggml_nelements(src0) * faElemSize); + + int64_t* src0_f16_ne = src0->ne; + size_t src0_f16_nb[GGML_MAX_DIMS]; + src0_f16_nb[0] = sizeof(uint16_t); + for(int i = 1; i < GGML_MAX_DIMS; ++i){ + src0_f16_nb[i] = src0_f16_nb[i - 1] * src0_f16_ne[i - 1]; + } + + acl_src0_f16_tensor = ggml_cann_create_tensor( + src0_f16_buffer, faDataType, faElemSize, + src0_f16_ne, src0_f16_nb, GGML_MAX_DIMS + ); + aclnn_cast(ctx, acl_src0_f32_tensor, acl_src0_f16_tensor, faDataType); + ggml_cann_release_resources(ctx, acl_src0_f32_tensor); + }else{ + acl_src0_f16_tensor = ggml_cann_create_tensor(src0); + } + + // Step 2: create the acl tensors for src1 (Key), src2 (Value), + // and the direct output from FusedInferAttention + + acl_src1_f16_tensor = ggml_cann_create_tensor(src1); + acl_src2_f16_tensor = ggml_cann_create_tensor(src2); + + ggml_cann_pool_alloc out_f16_allocator(ctx.pool()); + void* out_f16_buffer = out_f16_allocator.alloc( + ggml_nelements(dst) * faElemSize); + + int64_t* out_f16_ne = src0->ne; + size_t out_f16_nb[GGML_MAX_DIMS]; + out_f16_nb[0] = faElemSize; + for(int i = 1; i < 
GGML_MAX_DIMS; ++i){ + out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1]; + } + + acl_dst_f16_tensor = ggml_cann_create_tensor( + out_f16_buffer, faDataType, faElemSize, + out_f16_ne, out_f16_nb, GGML_MAX_DIMS + ); + + // Step 3: create the PSEShift tensor if needed + // this tensor is considered as mask (f16) in the llama.cpp + + aclTensor* bcast_pse_tensor = nullptr; + int64_t bcast_pse_ne[GGML_MAX_DIMS]; + size_t bcast_pse_nb[GGML_MAX_DIMS]; + ggml_cann_pool_alloc bcast_pse_allocator(ctx.pool()); + void* bcast_pse_buffer = nullptr; + + if(src3 != nullptr){ + bcast_pse_buffer = bcast_pse_allocator.alloc( + ggml_nelements(src3) * src0->ne[2] * sizeof(uint16_t)); + + if(src0->ne[1] > 1){ + // Case 1: broadcast pse for prefill stage with multiple head + aclTensor* acl_mask_f16_tensor = ggml_cann_create_tensor(src3); + bcast_pse_ne[0] = src3->ne[0]; + bcast_pse_ne[1] = src3->ne[1]; + bcast_pse_ne[2] = src0->ne[2]; + bcast_pse_ne[3] = src3->ne[3]; + + bcast_pse_nb[0] = sizeof(uint16_t); + for(int i = 1; i < GGML_MAX_DIMS; ++i){ + bcast_pse_nb[i] = bcast_pse_nb[i - 1] * bcast_pse_ne[i - 1]; + } + + bcast_pse_tensor = ggml_cann_create_tensor( + bcast_pse_buffer, ACL_FLOAT16, sizeof(uint16_t), + bcast_pse_ne, bcast_pse_nb, GGML_MAX_DIMS); + + int64_t repeats[] = {1, src0->ne[2], 1, 1}; + aclnn_repeat(ctx, acl_mask_f16_tensor, bcast_pse_tensor, repeats); + + ggml_cann_release_resources(ctx, acl_mask_f16_tensor); + }else{ + // Case 2: trunc the first row and broadcast pse for decode stage with multiple head + int64_t trunc_pse_ne[GGML_MAX_DIMS] = {src3->ne[0], src0->ne[1], src3->ne[2], src3->ne[3]}; + size_t* trunc_pse_nb = src3->nb; + + aclTensor* acl_mask_f16_trunc_tensor = ggml_cann_create_tensor( + src3->data, ACL_FLOAT16, sizeof(uint16_t), + trunc_pse_ne, trunc_pse_nb, GGML_MAX_DIMS); + + bcast_pse_ne[0] = src3->ne[0]; + bcast_pse_ne[1] = src0->ne[1]; + bcast_pse_ne[2] = src0->ne[2]; + bcast_pse_ne[3] = src3->ne[3]; + + bcast_pse_nb[0] = sizeof(uint16_t); + for(int i = 1; i < GGML_MAX_DIMS; ++i){ + bcast_pse_nb[i] = bcast_pse_nb[i - 1] * bcast_pse_ne[i - 1]; + } + + bcast_pse_tensor = ggml_cann_create_tensor( + bcast_pse_buffer, ACL_FLOAT16, sizeof(uint16_t), + bcast_pse_ne, bcast_pse_nb, GGML_MAX_DIMS); + + int64_t repeats[] = {1, src0->ne[2], 1, 1}; + aclnn_repeat(ctx, acl_mask_f16_trunc_tensor, bcast_pse_tensor, repeats); + + ggml_cann_release_resources(ctx, acl_mask_f16_trunc_tensor); + } + + // Compute the slope if needed. Derived from ggml_cann_softmax(). 
+ if(maxBias != 0.0f){ + // alibi + const int64_t ne2_ne3 = src0->ne[2] * src0->ne[3]; + const int64_t n_head = src0->ne[2]; + const int n_heads_log2_floor = 1u << (uint32_t)floor(log2(n_head)); + float m0 = powf(2.0f, -(maxBias) / n_heads_log2_floor); + float m1 = powf(2.0f, -(maxBias / 2.0f) / n_heads_log2_floor); + // init arange + ggml_cann_pool_alloc arange_allocator(ctx.pool(), + ne2_ne3 * faElemSize); + void* tmp_arange_buffer = arange_allocator.get(); + + // arange1: [1, ..., n_heads_log2_floor+1) + float start = 1; + float stop = n_heads_log2_floor + 1; + float step = 1; + int64_t n_elements_arange = n_heads_log2_floor; + + int64_t tmp_arange1_ne[] = {n_heads_log2_floor}; + size_t tmp_arange1_nb[] = {faElemSize}; + aclTensor* tmp_arange1_tensor = ggml_cann_create_tensor( + tmp_arange_buffer, faDataType, faElemSize, + tmp_arange1_ne, tmp_arange1_nb, + GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + + aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange); + + aclTensor* tmp_arange2_tensor = nullptr; + if (n_heads_log2_floor < ne2_ne3) { + // arange2: [1, ..., 2 * (k - n_heads_log2_floor) + 1) + start = 1; + stop = 2 * (ne2_ne3 - n_heads_log2_floor) + 1; + step = 2; + n_elements_arange = ne2_ne3 - n_heads_log2_floor; + int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor}; + size_t tmp_arange2_nb[] = {faElemSize}; + + aclTensor* tmp_arange2_tensor = ggml_cann_create_tensor( + (char*)tmp_arange_buffer + + n_heads_log2_floor * faElemSize, + faDataType, faElemSize, + tmp_arange2_ne, tmp_arange2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step, + n_elements_arange); + } + + // init mk_base + ggml_cann_pool_alloc mk_base_allocator(ctx.pool(), + ne2_ne3 * faElemSize); + void* tmp_mk_base_buffer = mk_base_allocator.get(); + int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor}; + size_t tmp_mk_base1_nb[] = {faElemSize}; + aclTensor* tmp_mk_base1_tensor = ggml_cann_create_tensor( + tmp_mk_base_buffer, faDataType, faElemSize, + tmp_mk_base1_ne, tmp_mk_base1_nb, + GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + + aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor); + + aclTensor* tmp_mk_base2_tensor = nullptr; + if (n_heads_log2_floor < ne2_ne3) { + int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor}; + size_t tmp_mk_base2_nb[] = {faElemSize}; + aclTensor* tmp_mk_base2_tensor = ggml_cann_create_tensor( + (char*)tmp_mk_base_buffer + + n_heads_log2_floor * faElemSize, + faDataType, faElemSize, + tmp_mk_base2_ne, tmp_mk_base2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor); + } + + // init mk + int64_t tmp_mk_base_ne[] = {ne2_ne3}; + size_t tmp_mk_base_nb[] = {faElemSize}; + aclTensor* tmp_mk_base_tensor = ggml_cann_create_tensor( + tmp_mk_base_buffer, faDataType, faElemSize, + tmp_mk_base_ne, tmp_mk_base_nb, + GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + aclTensor* tmp_arange_tensor = ggml_cann_create_tensor( + tmp_arange_buffer, faDataType, faElemSize, + tmp_mk_base_ne, tmp_mk_base_nb, + GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor); + + // reshape mk + int64_t tmp_mk_ne[] = {1, 1, src0->ne[2], src0->ne[3]}; + size_t tmp_mk_nb[GGML_MAX_DIMS]; + tmp_mk_nb[0] = faElemSize; + for (int i = 1; i < GGML_MAX_DIMS; i++) { + tmp_mk_nb[i] = tmp_mk_nb[i - 1] * tmp_mk_ne[i - 1]; + } + aclTensor* tmp_mk_tensor = ggml_cann_create_tensor( + tmp_mk_base_buffer, faDataType, faElemSize, + tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS, + ACL_FORMAT_ND); + GGML_CANN_CALL_ACLNN_OP(ctx, 
InplaceMul, bcast_pse_tensor, tmp_mk_tensor); + + ggml_cann_release_resources(ctx, tmp_arange1_tensor, tmp_arange2_tensor, + tmp_mk_base1_tensor, tmp_mk_base2_tensor, tmp_mk_base_tensor, + tmp_arange_tensor, tmp_mk_tensor); + } + } + + // Step 4: set the inputs for FusedInferAttention. + int kvTensorNum = 1; + aclTensor* acl_q_tensor = acl_src0_f16_tensor; + aclTensor* acl_k_tensors[] = {acl_src1_f16_tensor}; + aclTensor* acl_v_tensors[] = {acl_src2_f16_tensor}; + auto acl_k_tensor_list = aclCreateTensorList(acl_k_tensors, kvTensorNum); + auto acl_v_tensor_list = aclCreateTensorList(acl_v_tensors, kvTensorNum); + + int64_t numHeads = src0->ne[2]; // N + int64_t numKeyValueHeads = src1->ne[2]; + // double scaleValue = 1 / sqrt(src0->ne[0]); // 1/sqrt(d) + int64_t preTokens = 65535; + int64_t nextTokens = 65535; + char layout[5] = {'B', 'N', 'S', 'D', 0}; + int64_t sparseMode = 0; + int64_t innerPrecise = (src0->ne[1] == 1) ? 0 : 2; + int64_t blockSize = 0; + int64_t antiquantMode = 0; + bool softmaxLseFlag = false; + int64_t keyAntiquantMode = 0; + int64_t valueAntiquantMode = 0; + + // Step 5: launch the FusedInferAttentionScoreV2 kernel. + // Refer to https://gitee.com/ascend/cann-ops-adv/blob/master/docs/FusedInferAttentionScoreV2.md + + GGML_CANN_CALL_ACLNN_OP(ctx, FusedInferAttentionScoreV2, + acl_q_tensor, acl_k_tensor_list, acl_v_tensor_list, // q, k, v + bcast_pse_tensor, nullptr, // pse, mask + nullptr, nullptr, // actSeqLen, actSeqLenkv + nullptr, nullptr, // deqScale1, quantScale1 + nullptr, nullptr, nullptr, // deqScale2, quantScale2, quantOffset2 + nullptr, nullptr, // antiquantScale, antiquantOffset + nullptr, // blockTable + nullptr, nullptr, // qPadSize, kvPadSize + nullptr, nullptr, // kAntiquantScale, kAntiQuantOffset + nullptr, nullptr, // vAntiquantScale, vAntiQuantOffset + nullptr, nullptr, nullptr, // kSharedPrefix, vSharedPrefix, actSharedLen + numHeads, scaleValue, // heads, scaleValue + preTokens, nextTokens, // preTokens, nextTokens + layout, // inputLayout + numKeyValueHeads, // numKVHeads + sparseMode, innerPrecise, // sparseMode, innerPrecise + blockSize, antiquantMode, // blockSize, antiquantMode + softmaxLseFlag, // softmaxLseFlag + keyAntiquantMode, valueAntiquantMode, // keyAntiqMode, valueAntiqMode + acl_dst_f16_tensor, // attentionOut + nullptr // softmaxLse + ); + + // Step 6: post-processing, permute and cast to f32 + + int64_t new_dim[] = {0, 2, 1, 3}; + aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst); + + if(ggml_cann_type_mapping(dst->type) != faDataType){ + ggml_cann_pool_alloc perm_out_f16_allocator(ctx.pool()); + perm_out_f16_allocator.alloc(ggml_nelements(dst) * faElemSize); + void* perm_out_f16_buffer = perm_out_f16_allocator.get(); + + int64_t* perm_out_f16_ne = dst->ne; + size_t perm_out_f16_nb[GGML_MAX_DIMS]; + perm_out_f16_nb[0] = faElemSize; + for(int i = 1; i < GGML_MAX_DIMS; ++i){ + perm_out_f16_nb[i] = perm_out_f16_nb[i - 1] * perm_out_f16_ne[i - 1]; + } + aclTensor* acl_perm_out_f16_tensor = ggml_cann_create_tensor( + perm_out_f16_buffer, faDataType, faElemSize, + perm_out_f16_ne, perm_out_f16_nb, GGML_MAX_DIMS); + aclnn_permute(ctx, acl_dst_f16_tensor, acl_perm_out_f16_tensor, new_dim, GGML_MAX_DIMS); + aclnn_cast(ctx, + acl_perm_out_f16_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type)); + ggml_cann_release_resources(ctx, acl_perm_out_f16_tensor); + }else{ + // only need to permute + aclnn_permute(ctx, acl_dst_f16_tensor, acl_dst_tensor, new_dim, GGML_MAX_DIMS); + } + ggml_cann_release_resources(ctx, 
acl_src0_f16_tensor, + acl_src1_f16_tensor, + acl_src2_f16_tensor, + acl_dst_f16_tensor, + acl_dst_tensor); + if(src3 != nullptr){ + ggml_cann_release_resources(ctx, bcast_pse_tensor); + } + }else{ + GGML_ABORT("Function is not implemented."); + } +} diff --git a/ggml/src/ggml-cann/aclnn_ops.h b/ggml/src/ggml-cann/aclnn_ops.h old mode 100644 new mode 100755 index 15993cce6..80ce80bae --- a/ggml/src/ggml-cann/aclnn_ops.h +++ b/ggml/src/ggml-cann/aclnn_ops.h @@ -714,6 +714,21 @@ void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst); */ void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst); +/** + * @brief Performs the Flash Attention extended operator using the CANN backend. + * + * @details This function implements the memory-efficient Flash Attention algorithm + * for computing scaled dot-product attention with hardware acceleration. + * The result is stored in the destination tensor `dst`. + * + * This operation is accelerated using the CANN backend to improve runtime performance. + * + * @param ctx The CANN context used for operations. + * @param dst The destination tensor where the result will be stored. + * dst->op is expected to be `GGML_OP_FLASH_ATTN_EXT`. + */ +void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst); + /* * @brief A generic wrapper for ACL resources with custom deleter support. */ diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h old mode 100644 new mode 100755 diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp old mode 100644 new mode 100755 index 0cb7bbf17..c0ea26002 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -36,6 +36,7 @@ #include "ggml-backend-impl.h" #include "ggml-cann/aclnn_ops.h" #include "ggml-cann/common.h" +#include "ggml.h" #define GGML_COMMON_DECL_C @@ -1748,6 +1749,9 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx, case GGML_OP_COUNT_EQUAL: ggml_cann_count_equal(ctx, dst); break; + case GGML_OP_FLASH_ATTN_EXT: + ggml_cann_flash_attn_ext(ctx, dst); + break; default: return false; } @@ -2035,6 +2039,15 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, case GGML_TYPE_F16: case GGML_TYPE_F32: return true; + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q4_0: +#ifdef ASCEND_310P + // Q4 && Q8 per group is not suppor on 310p device + return false; +#endif + // only support contiguous for quantized types. 
+ return ggml_is_contiguous(op->src[0]) && + ggml_is_contiguous(op->src[1]); default: return false; } @@ -2168,6 +2181,38 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, case GGML_OP_PAD_REFLECT_1D: case GGML_OP_COUNT_EQUAL: return true; + case GGML_OP_FLASH_ATTN_EXT:{ + // derived from [ggml-cuda.cu] + if(op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16){ + return false; + } + if(op->src[1]->type != GGML_TYPE_F16 && op->src[1]->type != GGML_TYPE_F32 && op->src[1]->type != GGML_TYPE_BF16){ + return false; + } + if(op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16){ + return false; + } + if (op->src[1]->ne[0] != op->src[2]->ne[0]) { + // different head sizes of K and V are not supported yet + return false; + } + if (op->src[0]->ne[0] == 192) { + return false; + } + if (op->src[0]->ne[0] == 576) { + // DeepSeek MLA + return false; + } + if (op->src[0]->ne[3] != 1) { + return false; + } + float logitSoftcap = 0.0f; + memcpy(&logitSoftcap, (float*)op->op_params + 2, sizeof(float)); + if(logitSoftcap != 0.0f) { + return false; + } + return true; + } default: return false; } diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 1d4259dae..b3237eead 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -82,13 +82,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name) target_link_libraries(${GGML_CPU_NAME} PUBLIC memkind) endif() - if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR - CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR - (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND - CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$")) - + if (GGML_SYSTEM_ARCH STREQUAL "ARM") message(STATUS "ARM detected") - if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang") message(FATAL_ERROR "MSVC is not supported for ARM, use clang") else() @@ -170,12 +165,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name) endforeach() endif() endif() - elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR - (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND - CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$")) - + elseif (GGML_SYSTEM_ARCH STREQUAL "x86") message(STATUS "x86 detected") - if (MSVC) # instruction set detection for MSVC only if (GGML_NATIVE) @@ -299,7 +290,26 @@ function(ggml_add_cpu_backend_variant_impl tag_name) endif() endif() endif() - elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ") + + if (GGML_BACKEND_DL) + if (GGML_NATIVE) + # the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE + message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS") + endif() + + # The feature detection code is compiled as a separate target so that + # it can be built without the architecture flags + # Since multiple variants of the CPU backend may be included in the same + # build, using set_source_files_properties() to set the arch flags is not possible + set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats) + add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp) + target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. 
../include) + target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS}) + target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED) + set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME}) + endif() + elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC") message(STATUS "PowerPC detected") if (GGML_NATIVE) if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") @@ -325,9 +335,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name) list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE}) endif() endif() - elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") + elseif (GGML_SYSTEM_ARCH STREQUAL "loongarch64") message(STATUS "loongarch64 detected") - list(APPEND ARCH_FLAGS -march=loongarch64) if (GGML_LASX) list(APPEND ARCH_FLAGS -mlasx) @@ -335,16 +344,18 @@ function(ggml_add_cpu_backend_variant_impl tag_name) if (GGML_LSX) list(APPEND ARCH_FLAGS -mlsx) endif() - elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64") - message(STATUS "RISC-V detected") + elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64") + message(STATUS "riscv64 detected") if (GGML_RVV) - if (GGML_RV_ZFH) - list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -DGGML_RV_ZFH -mabi=lp64d) + if (GGML_XTHEADVECTOR) + list(APPEND ARCH_FLAGS -march=rv64gc_xtheadvector -mabi=lp64d) + elseif (GGML_RV_ZFH) + list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -mabi=lp64d) else() list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d) endif() endif() - elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x") + elseif (GGML_SYSTEM_ARCH STREQUAL "s390x") message(STATUS "s390x detected") file(READ "/proc/cpuinfo" CPUINFO_CONTENTS) string(REGEX REPLACE "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" "\\1" S390X_M ${CPUINFO_CONTENTS}) @@ -477,25 +488,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name) target_compile_options(${GGML_CPU_NAME} PRIVATE ${ARCH_FLAGS}) target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS}) - if (GGML_BACKEND_DL) - if (GGML_NATIVE) - # the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE - message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS") - endif() - - # The feature detection code is compiled as a separate target so that - # it can be built without the architecture flags - # Since multiple variants of the CPU backend may be included in the same - # build, using set_source_files_properties() to set the arch flags is not possible - set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats) - add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp) - target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. 
../include) - target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS}) - target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED) - set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME}) - endif() - if (EMSCRIPTEN) set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128") endif() diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp index 8ff6d64a4..0a3ff867c 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp @@ -1191,7 +1191,7 @@ static void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c } } return; -#elif defined(__riscv_v_intrinsic) +#elif defined __riscv_v if (__riscv_vlenb() >= QK4_0) { const size_t vl = QK4_0; @@ -3783,7 +3783,7 @@ static void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c } return; } -#elif defined(__riscv_v_intrinsic) +#elif defined __riscv_v if (__riscv_vlenb() >= QK4_0) { const size_t vl = QK4_0; diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h index e4af07635..b3f1b5ca7 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-impl.h +++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h @@ -320,21 +320,17 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) #ifdef __wasm_simd128__ #include -#else +#endif + #ifdef __POWER9_VECTOR__ #include -#else +#endif + #if defined(_MSC_VER) || defined(__MINGW32__) #include -#else -#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__) -#if !defined(__riscv) +#elif defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__) #include #endif -#endif -#endif -#endif -#endif #ifdef __riscv_v_intrinsic #include diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ggml/src/ggml-cpu/ggml-cpu-quants.c index a89ce9bb1..40bded476 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-quants.c +++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c @@ -883,7 +883,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); #endif } -#elif defined(__riscv_v_intrinsic) +#elif defined(__riscv_v) size_t vl = QK8_0; @@ -1221,7 +1221,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); #endif } -#elif defined(__riscv_v_intrinsic) +#elif defined(__riscv_v) size_t vl = QK8_1; @@ -2384,7 +2384,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); -#elif defined(__riscv_v_intrinsic) +#elif defined(__riscv_v) size_t vl = qk / 2; for (; ib < nb; ++ib) { @@ -2774,7 +2774,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi } sumf = hsum_float_8(acc) + summs; -#elif defined(__riscv_v_intrinsic) +#elif defined(__riscv_v) size_t vl = qk / 2; for (; ib < nb; ++ib) { @@ -3121,7 +3121,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } sumf = hsum_float_8(acc); -#elif defined(__riscv_v_intrinsic) +#elif defined(__riscv_v) size_t vl; size_t vlenb = __riscv_vlenb(); @@ -3460,7 +3460,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi } sumf = 
hsum_float_8(acc) + summs; -#elif defined(__riscv_v_intrinsic) +#elif defined(__riscv_v) size_t vl; size_t vlenb = __riscv_vlenb(); @@ -3897,7 +3897,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } sumf = hsum_float_8(accum); -#elif defined(__riscv_v_intrinsic) +#elif defined(__riscv_v) size_t vl = qk; for (; ib < nb; ++ib) { @@ -5100,14 +5100,111 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi *s = sumf; -#elif defined __riscv_v_intrinsic +#elif defined __riscv_xtheadvector + + float sumf = 0; + uint8_t atmp[16]; + + for (int i = 0; i < nb; ++i) { + const uint8_t * q2 = x[i].qs; + const int8_t * q8 = y[i].qs; + const uint8_t * sc = x[i].scales; + const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + uint8_t *patmp = atmp; + int vsums; + int tmp; + __asm__ __volatile__( + "th.vsetvli zero, %[vl16], e8, m1\n\t" + "th.vmv.v.x v8, zero\n\t" + "th.vlb.v v1, (%[sc])\n\t" + "th.vand.vi v0, v1, 0xF\n\t" + "th.vsrl.vi v1, v1, 4\n\t" + "th.vsb.v v0, (%[scale])\n\t" + "th.vwaddu.vx v16, v1, zero\n\t" + "th.vsetvli zero, %[vl16], e16, m2\n\t" + "th.vlh.v v2, (%[bsums])\n\t" + "th.vwmul.vv v4, v16, v2\n\t" + "th.vsetvli zero, %[vl16], e32, m4\n\t" + "th.vredsum.vs v8, v4, v8\n\t" + "th.vmv.x.s %[vsums], v8" + : [tmp] "=&r" (tmp), [vsums] "=&r" (vsums) + : [sc] "r" (sc), [scale] "r" (atmp), [bsums] "r" (y[i].bsums) + , [vl16] "r" (16) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + sumf += dmin * vsums; + int isum = 0; + + for (int j = 0; j < QK_K/128; ++j) { + __asm__ __volatile__( + "th.vsetvli zero, %[vl32], e8, m2\n\t" + "th.vlb.v v0, (%[q2])\n\t" + "th.vsrl.vi v2, v0, 2\n\t" + "th.vsrl.vi v4, v0, 4\n\t" + "th.vsrl.vi v6, v0, 6\n\t" + "th.vand.vi v0, v0, 0x3\n\t" + "th.vand.vi v2, v2, 0x3\n\t" + "th.vand.vi v4, v4, 0x3\n\t" + "th.vsetvli zero, %[vl128], e8, m8\n\t" + "th.vlb.v v8, (%[q8])\n\t" + "th.vsetvli zero, %[vl64], e8, m4\n\t" + "th.vwmul.vv v16, v0, v8\n\t" + "th.vwmul.vv v24, v4, v12\n\t" + "th.vsetvli zero, %[vl16], e16, m2\n\t" + "th.vmv.v.x v0, zero\n\t" + "th.vwredsum.vs v10, v16, v0\n\t" + "th.vwredsum.vs v9, v18, v0\n\t" + "th.vwredsum.vs v8, v20, v0\n\t" + "th.vwredsum.vs v7, v22, v0\n\t" + "th.vwredsum.vs v11, v24, v0\n\t" + "th.vwredsum.vs v12, v26, v0\n\t" + "th.vwredsum.vs v13, v28, v0\n\t" + "th.vwredsum.vs v14, v30, v0\n\t" + "li %[tmp], 4\n\t" + "th.vsetvli zero, %[tmp], e32, m1\n\t" + "th.vslideup.vi v10, v9, 1\n\t" + "th.vslideup.vi v8, v7, 1\n\t" + "th.vslideup.vi v11, v12, 1\n\t" + "th.vslideup.vi v13, v14, 1\n\t" + "th.vslideup.vi v10, v8, 2\n\t" + "th.vslideup.vi v11, v13, 2\n\t" + "li %[tmp], 8\n\t" + "th.vsetvli zero, %[tmp], e32, m2\n\t" + "th.vlbu.v v12, (%[scale])\n\t" + "th.vmul.vv v10, v10, v12\n\t" + "th.vredsum.vs v0, v10, v0\n\t" + "th.vmv.x.s %[tmp], v0\n\t" + "add %[isum], %[isum], %[tmp]" + : [tmp] "=&r" (tmp), [isum] "+&r" (isum) + : [q2] "r" (q2), [scale] "r" (patmp), [q8] "r" (q8) + , [vl16] "r" (16), [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + q2 += 32; q8 += 128; patmp += 
8; + } + + sumf += dall * isum; + } + + *s = sumf; + +#elif defined __riscv_v + + float sumf = 0; + uint8_t atmp[16]; const int vector_length = __riscv_vlenb() * 8; - float sumf = 0; - uint8_t temp_01[32] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; - uint8_t atmp[16]; switch (vector_length) { case 256: @@ -6137,14 +6234,141 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi *s = sumf; -#elif defined __riscv_v_intrinsic +#elif defined __riscv_xtheadvector - uint32_t aux[3]; uint32_t utmp[4]; - - const int vector_length = __riscv_vlenb() * 8; float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * restrict q3 = x[i].qs; + const uint8_t * restrict qh = x[i].hmask; + const int8_t * restrict q8 = y[i].qs; + + int8_t * scale = (int8_t *)utmp; + int tmp; + __asm__ __volatile__( + "li %[tmp], 12\n\t" + "th.vsetvli zero, %[tmp], e8, m1\n\t" + "th.vlb.v v0, (%[s6b])\n\t" + "th.vmv.v.v v2, v0\n\t" + "li %[tmp], 2\n\t" + "th.vsetvli zero, %[tmp], e64, m1\n\t" + "th.vmv.v.x v9, %[sh]\n\t"\ + "th.vslidedown.vi v1, v0, 1\n\t" + "th.vslide1up.vx v8, v9, zero\n\t" // {0, 0, 4, 4} + "th.vslideup.vi v0, v2, 1\n\t" // {aux[0], aux[1], aux[0], aux[1]} + "li %[tmp], 4\n\t" + "th.vsetvli zero, %[tmp], e32, m1\n\t" + "th.vid.v v9\n\t" + "th.vmv.x.s %[tmp], v1\n\t" + "th.vsll.vi v9, v9, 1\n\t" // {0, 2, 4, 6} + "th.vmv.v.x v1, %[tmp]\n\t" // {aux[2], aux[2], aux[2], aux[2]} + "th.vsrl.vv v4, v1, v9\n\t" + "th.vsrl.vv v2, v0, v8\n\t" + "th.vand.vx v5, v4, %[kmask1]\n\t" + "th.vand.vx v3, v2, %[kmask2]\n\t" + "th.vsll.vi v6, v5, 4\n\t" + "th.vor.vv v7, v6, v3\n\t" + "li %[tmp], 16\n\t" + "th.vsetvli zero, %[tmp], e8, m1\n\t" + "th.vsub.vx v0, v7, %[c]\n\t" + "th.vsb.v v0, (%[scale])" + : [tmp] "=&r" (tmp) + : [sh] "r" (0x0000000400000004), [s6b] "r" (x[i].scales), [c] "r" (32) + , [scale] "r" (scale), [kmask1] "r" (kmask1), [kmask2] "r" (kmask2) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + + uint8_t m = 1; + int isum = 0; + for (int j = 0; j < QK_K; j += 128) { + __asm__ __volatile__( + // fixme: use v0p7 mask layout directly + "th.vsetvli zero, %[vl32], e8, m2\n\t" + "th.vlb.v v8, (%[q3])\n\t" + "th.vsrl.vi v10, v8, 2\n\t" + "th.vsrl.vi v12, v8, 4\n\t" + "th.vsrl.vi v14, v8, 6\n\t" + "th.vand.vi v8, v8, 3\n\t" + "th.vand.vi v10, v10, 3\n\t" + "th.vand.vi v12, v12, 3\n\t" + "th.vlb.v v2, (%[qh])\n\t" + "th.vand.vx v4, v2, %[m]\n\t" + "slli %[m], %[m], 1\n\t" + "th.vmseq.vx v0, v4, zero\n\t" + "th.vadd.vi v8, v8, -4, v0.t\n\t" + "th.vand.vx v4, v2, %[m]\n\t" + "slli %[m], %[m], 1\n\t" + "th.vmseq.vx v0, v4, zero\n\t" + "th.vadd.vi v10, v10, -4, v0.t\n\t" + "th.vand.vx v4, v2, %[m]\n\t" + "slli %[m], %[m], 1\n\t" + "th.vmseq.vx v0, v4, zero\n\t" + "th.vadd.vi v12, v12, -4, v0.t\n\t" + "th.vand.vx v4, v2, %[m]\n\t" + "slli %[m], %[m], 1\n\t" + "th.vmseq.vx v0, v4, zero\n\t" + "th.vadd.vi v14, v14, -4, v0.t\n\t" + "th.vsetvli zero, %[vl128], e8, m8\n\t" + "th.vlb.v v0, (%[q8])\n\t" + "th.vsetvli zero, %[vl64], e8, m4\n\t" + "th.vwmul.vv v16, v0, v8\n\t" + "th.vwmul.vv v24, v4, v12\n\t" + "li %[tmp], 16\n\t" + "th.vsetvli zero, %[tmp], e16, m2\n\t" + "th.vmv.v.x v0, zero\n\t" + "th.vwredsum.vs v10, v16, v0\n\t" + "th.vwredsum.vs v9, v18, v0\n\t" + "th.vwredsum.vs v8, v20, v0\n\t" + "th.vwredsum.vs v7, v22, v0\n\t" + 
"th.vwredsum.vs v11, v24, v0\n\t" + "th.vwredsum.vs v12, v26, v0\n\t" + "th.vwredsum.vs v13, v28, v0\n\t" + "th.vwredsum.vs v14, v30, v0\n\t" + "li %[tmp], 4\n\t" + "th.vsetvli zero, %[tmp], e32, m1\n\t" + "th.vslideup.vi v10, v9, 1\n\t" + "th.vslideup.vi v8, v7, 1\n\t" + "th.vslideup.vi v11, v12, 1\n\t" + "th.vslideup.vi v13, v14, 1\n\t" + "th.vslideup.vi v10, v8, 2\n\t" + "th.vslideup.vi v11, v13, 2\n\t" + "li %[tmp], 8\n\t" + "th.vsetvli zero, %[tmp], e32, m2\n\t" + "th.vlb.v v12, (%[scale])\n\t" + "th.vmul.vv v10, v10, v12\n\t" + "th.vredsum.vs v0, v10, v0\n\t" + "th.vmv.x.s %[tmp], v0\n\t" + "add %[isum], %[isum], %[tmp]" + : [tmp] "=&r" (tmp), [m] "+&r" (m), [isum] "+&r" (isum) + : [vl128] "r" (128), [vl64] "r" (64), [vl32] "r" (32) + , [q3] "r" (q3), [qh] "r" (qh), [scale] "r" (scale), [q8] "r" (q8) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + q3 += 32; q8 += 128; scale += 8; + } + + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + sumf += d * isum; + } + + *s = sumf; + +#elif defined __riscv_v + + uint32_t utmp[4]; + float sumf = 0; + uint32_t aux[3]; + const int vector_length = __riscv_vlenb() * 8; + switch (vector_length) { case 256: for (int i = 0; i < nb; ++i) { @@ -6331,7 +6555,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi "vslideup.vi v13, v14, 1\n\t" "vslideup.vi v10, v8, 2\n\t" "vslideup.vi v11, v13, 2\n\t" - "vsetivli zero, 8, e32, m2\n\t"\ + "vsetivli zero, 8, e32, m2\n\t" "vle8.v v15, (%[scale])\n\t" "vsext.vf4 v12, v15\n\t" "vmul.vv v10, v10, v12\n\t" @@ -6771,7 +6995,11 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); +#ifdef __ARM_FEATURE_MATMUL_INT8 + assert((nrc == 2) || (nrc == 1)); +#else assert(nrc == 1); +#endif UNUSED(nrc); UNUSED(bx); UNUSED(by); @@ -6788,6 +7016,146 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi uint32_t utmp[4]; +#if defined(__ARM_FEATURE_MATMUL_INT8) + if (nrc == 2) { + const block_q4_K * GGML_RESTRICT x0 = x; + const block_q4_K * GGML_RESTRICT x1 = (const block_q4_K *) ((const uint8_t *)vx + bx); + const block_q8_K * GGML_RESTRICT y0 = y; + const block_q8_K * GGML_RESTRICT y1 = (const block_q8_K *) ((const uint8_t *)vy + by); + + const uint8x16_t m4b = vdupq_n_u8(0x0f); + + float32x4_t vfsum = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; ++i, ++x0, ++x1, ++y0, ++y1) { + const uint8_t * GGML_RESTRICT qx0 = x0->qs; + const uint8_t * GGML_RESTRICT qx1 = x1->qs; + const int8_t * GGML_RESTRICT qy0 = y0->qs; + const int8_t * GGML_RESTRICT qy1 = y1->qs; + + // decode scales and mins + int8_t x0_scales[8], x1_scales[8]; + int16x8_t x0_mins, x1_mins; + { + uint32_t scales_mins[3]; + memcpy(scales_mins, x0->scales, 12); + const uint32_t mins_0_3 = scales_mins[1] & kmask1; + const uint32_t mins_4_7 = ((scales_mins[2] >> 4) & kmask2) | (((scales_mins[1] >> 6) & kmask3) << 4); + const uint32x2_t mins = {mins_0_3, mins_4_7}; + x0_mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins))); + uint32_t scales[2]; + scales[0] = scales_mins[0] & kmask1; // scales 0~3 + scales[1] = (scales_mins[2] & kmask2) | (((scales_mins[0] >> 6) & kmask3) << 4); // 
scales 4~7 + memcpy(x0_scales, scales, 8); + } + { + uint32_t scales_mins[3]; + memcpy(scales_mins, x1->scales, 12); + const uint32_t mins_0_3 = scales_mins[1] & kmask1; + const uint32_t mins_4_7 = ((scales_mins[2] >> 4) & kmask2) | (((scales_mins[1] >> 6) & kmask3) << 4); + const uint32x2_t mins = {mins_0_3, mins_4_7}; + x1_mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins))); + uint32_t scales[2]; + scales[0] = scales_mins[0] & kmask1; // scales 0~3 + scales[1] = (scales_mins[2] & kmask2) | (((scales_mins[0] >> 6) & kmask3) << 4); // scales 4~7 + memcpy(x1_scales, scales, 8); + } + + int32x4_t visum = {0}; + + // process 64 data points per iteration, totally 256 data points + for (int j = 0; j < QK_K / 64; ++j, qx0 += 32, qx1 += 32, qy0 += 64, qy1 += 64) { + const int8x16x4_t vy0 = vld1q_s8_x4(qy0); + const int8x16x4_t vy1 = vld1q_s8_x4(qy1); + + int8x16_t vx0[4], vx1[4]; + { + const uint8x16x2_t vv = vld1q_u8_x2(qx0); + vx0[0] = vreinterpretq_s8_u8(vandq_u8(vv.val[0], m4b)); + vx0[1] = vreinterpretq_s8_u8(vandq_u8(vv.val[1], m4b)); + vx0[2] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[0], 4)); + vx0[3] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[1], 4)); + } + { + const uint8x16x2_t vv = vld1q_u8_x2(qx1); + vx1[0] = vreinterpretq_s8_u8(vandq_u8(vv.val[0], m4b)); + vx1[1] = vreinterpretq_s8_u8(vandq_u8(vv.val[1], m4b)); + vx1[2] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[0], 4)); + vx1[3] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[1], 4)); + } + + // process 32 data points (share same block scale) per iteration + for (int k = 0; k < 2; ++k) { + const int blk = j * 2 + k; + const int32x4_t block_scale = { + x0_scales[blk], + x0_scales[blk], + x1_scales[blk], + x1_scales[blk], + }; + + int32x4_t vr = {0}; + for (int l = 0; l < 2; ++l) { + const int idx = k * 2 + l; + const int64x2_t vx0_s64 = vreinterpretq_s64_s8(vx0[idx]); + const int64x2_t vx1_s64 = vreinterpretq_s64_s8(vx1[idx]); + const int64x2_t vy0_s64 = vreinterpretq_s64_s8(vy0.val[idx]); + const int64x2_t vy1_s64 = vreinterpretq_s64_s8(vy1.val[idx]); + const int8x16_t vx_l = vreinterpretq_s8_s64(vzip1q_s64(vx0_s64, vx1_s64)); + const int8x16_t vx_h = vreinterpretq_s8_s64(vzip2q_s64(vx0_s64, vx1_s64)); + const int8x16_t vy_l = vreinterpretq_s8_s64(vzip1q_s64(vy0_s64, vy1_s64)); + const int8x16_t vy_h = vreinterpretq_s8_s64(vzip2q_s64(vy0_s64, vy1_s64)); + vr = vmmlaq_s32(vr, vx_l, vy_l); + vr = vmmlaq_s32(vr, vx_h, vy_h); + } + // apply block scale, will NOT overflow + // block_scale * sum_256(int4*int8) <= 2^(8+8+4+8) = 28 bits + visum = vmlaq_s32(visum, vr, block_scale); + } + } + + // adjust bias, apply superblock scale + { + int32_t bias[4]; + // no obvious uplift from sve sdot-16, just use neon mul add + const int16x8_t y0_sums = vpaddq_s16(vld1q_s16(y0->bsums), vld1q_s16(y0->bsums+8)); + const int16x8_t y1_sums = vpaddq_s16(vld1q_s16(y1->bsums), vld1q_s16(y1->bsums+8)); + bias[0] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y0_sums), vget_low_s16(x0_mins)), + vmull_s16(vget_high_s16(y0_sums), vget_high_s16(x0_mins)))); + bias[1] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y1_sums), vget_low_s16(x0_mins)), + vmull_s16(vget_high_s16(y1_sums), vget_high_s16(x0_mins)))); + bias[2] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y0_sums), vget_low_s16(x1_mins)), + vmull_s16(vget_high_s16(y0_sums), vget_high_s16(x1_mins)))); + bias[3] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y1_sums), vget_low_s16(x1_mins)), + vmull_s16(vget_high_s16(y1_sums), vget_high_s16(x1_mins)))); + const float32x4_t dmins = { + GGML_FP16_TO_FP32(x0->dmin) 
* y0->d, + GGML_FP16_TO_FP32(x0->dmin) * y1->d, + GGML_FP16_TO_FP32(x1->dmin) * y0->d, + GGML_FP16_TO_FP32(x1->dmin) * y1->d, + }; + vfsum = vmlsq_f32(vfsum, vcvtq_f32_s32(vld1q_s32(bias)), dmins); + + const float32x4_t superblock_scale = { + GGML_FP16_TO_FP32(x0->d) * y0->d, + GGML_FP16_TO_FP32(x0->d) * y1->d, + GGML_FP16_TO_FP32(x1->d) * y0->d, + GGML_FP16_TO_FP32(x1->d) * y1->d, + }; + vfsum = vmlaq_f32(vfsum, vcvtq_f32_s32(visum), superblock_scale); + } + } + + // vfsum = ABCD -> ACBD + // AC -> s, BD -> (s+bs) + vfsum = vzip1q_f32(vfsum, vextq_f32(vfsum, vfsum, 2)); + vst1_f32(s, vget_low_f32 (vfsum)); + vst1_f32(s + bs, vget_high_f32(vfsum)); + + return; + } +#endif + #ifdef __ARM_FEATURE_SVE float sumf = 0; for (int i = 0; i < nb; ++i) { @@ -7180,14 +7548,130 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m); -#elif defined __riscv_v_intrinsic +#elif defined __riscv_xtheadvector const uint8_t * scales = (const uint8_t*)&utmp[0]; const uint8_t * mins = (const uint8_t*)&utmp[2]; - const int vector_length = __riscv_vlenb() * 8; float sumf = 0; + for (int i = 0; i < nb; ++i) { + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + + int tmp, tmp2, sumi; + __asm__ __volatile__( + "li %[t1], 12\n\t" + "th.vsetvli zero, %[t1], e8, m1\n\t" + "th.vlb.v v1, (%[s6b])\n\t" // {aux[0], aux[1], aux[2]} + "li %[t1], 4\n\t" + "th.vsetvli zero, %[t1], e32, m1\n\t" + "th.vslidedown.vi v2, v1, 2\n\t" + "th.vmv.v.v v3, v2\n\t" + "th.vslideup.vi v2, v3, 1\n\t" // {aux[2], aux[2]} + "li %[t1], 2\n\t" + "th.vsetvli zero, %[t1], e32, m1\n\t" + "th.vmv.v.i v4, 4\n\t" + "th.vand.vx v8, v1, %[kmask1]\n\t" + "th.vslide1up.vx v5, v4, zero\n\t" // {0, 4} + "th.vsrl.vi v6, v1, 6\n\t" + "th.vsrl.vv v7, v2, v5\n\t" + "th.vand.vx v0, v6, %[kmask3]\n\t" + "th.vand.vx v2, v7, %[kmask2]\n\t" + "th.vsll.vi v6, v0, 4\n\t" + "li %[t2], 8\n\t" + "addi %[t1], %[utmp], 4\n\t" + "th.vor.vv v1, v6, v2\n\t" + "th.vssw.v v8, (%[utmp]), %[t2]\n\t" + "th.vssw.v v1, (%[t1]), %[t2]\n\t" + "th.vsetvli zero, zero, e32, m2\n\t" // vl == 8 + "th.vlw.v v2, (%[bsums])\n\t" + "th.vsetvli zero, %[t2], e16, m1\n\t" + "th.vnsrl.vi v0, v2, 0\n\t" + "th.vnsrl.vi v1, v2, 16\n\t" + "th.vadd.vv v2, v0, v1\n\t" + "th.vlbu.v v4, (%[mins])\n\t" + "th.vwmul.vv v6, v4, v2\n\t" + "th.vmv.v.x v0, zero\n\t" + "th.vsetvli zero, %[t2], e32, m2\n\t" + "th.vredsum.vs v0, v6, v0\n\t" + "th.vmv.x.s %[sumi], v0" + : [t1] "=&r" (tmp), [t2] "=&r" (tmp2), [sumi] "=&r" (sumi) + : [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp) + , [s6b] "r" (x[i].scales), [kmask1] "r" (kmask1) + , [kmask2] "r" (kmask2), [kmask3] "r" (kmask3) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + sumf -= dmin * sumi; + + const uint8_t * restrict q4 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + sumi = 0; + const uint8_t * scale = scales; + + for (int j = 0; j < QK_K/128; ++j) { + int vl128 = 128, vl64 = 64, vl32 = 32; + __asm__ __volatile__( + "th.vsetvli zero, %[vl128], e8, m8\n\t" + "th.vlb.v v8, (%[q8])\n\t" + "th.vsetvli zero, %[vl64], e8, m4\n\t" + "th.vlb.v v0, (%[q4])\n\t" + "th.vsrl.vi v4, v0, 4\n\t" + "th.vand.vi v0, v0, 0xF\n\t" + "th.vsetvli zero, %[vl32], e8, m2\n\t" + "th.vwmul.vv v28, v6, v14\n\t" + "th.vwmul.vv v20, v4, 
v10\n\t" + "th.vwmul.vv v24, v2, v12\n\t" + "th.vwmul.vv v16, v0, v8\n\t" + "li %[tmp], 4\n\t" + "th.vsetvli zero, %[tmp], e32, m1\n\t" + "th.vlbu.v v1, (%[scale])\n\t" + "th.vmv.v.x v0, zero\n\t" + "th.vsetvli zero, %[vl32], e16, m4\n\t" + "th.vwredsum.vs v6, v24, v0\n\t" + "th.vwredsum.vs v7, v28, v0\n\t" + "th.vwredsum.vs v4, v16, v0\n\t" + "th.vwredsum.vs v5, v20, v0\n\t" + "th.vsetvli zero, %[tmp], e32, m1\n\t" + "th.vslideup.vi v6, v7, 1\n\t" + "th.vslideup.vi v4, v5, 1\n\t" + "th.vslideup.vi v4, v6, 2\n\t" + "th.vmul.vv v8, v4, v1\n\t" + "th.vredsum.vs v0, v8, v0\n\t" + "th.vmv.x.s %[tmp], v0\n\t" + "add %[sumi], %[sumi], %[tmp]" + : [tmp] "=&r" (tmp), [sumi] "+&r" (sumi) + : [vl128] "r" (vl128), [vl64] "r" (vl64), [vl32] "r" (vl32) + , [q4] "r" (q4), [q8] "r" (q8), [scale] "r" (scale) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + + q4 += 64; q8 += 128; scale += 4; + } + + sumf += d * sumi; + + } + + *s = sumf; + +#elif defined __riscv_v + + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; + + float sumf = 0; + const int vector_length = __riscv_vlenb() * 8; + switch (vector_length) { case 256: for (int i = 0; i < nb; ++i) { @@ -8074,7 +8558,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi *s = sumf; -#elif defined __riscv_v_intrinsic +#elif defined __riscv_v const uint8_t * scales = (const uint8_t*)&utmp[0]; const uint8_t * mins = (const uint8_t*)&utmp[2]; @@ -9232,11 +9716,92 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi } *s = sumf; -#elif defined __riscv_v_intrinsic +#elif defined __riscv_xtheadvector - const int vector_length = __riscv_vlenb() * 8; float sumf = 0; + for (int i = 0; i < nb; ++i) { + + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + + const uint8_t * restrict q6 = x[i].ql; + const uint8_t * restrict qh = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + + const int8_t * restrict scale = x[i].scales; + + int sum_t = 0; + int t0; + + for (int j = 0; j < QK_K/128; ++j) { + __asm__ __volatile__( + "th.vsetvli zero, %[vl32], e8, m2\n\t" // vl == 32 + "th.vlb.v v4, (%[qh])\n\t" + "th.vsll.vi v0, v4, 4\n\t" + "th.vsll.vi v2, v4, 2\n\t" + "th.vsrl.vi v6, v4, 2\n\t" + "th.vsetvli zero, %[vl64], e8, m4\n\t" // vl == 64 + "th.vlb.v v8, (%[q6])\n\t" + "th.vsrl.vi v12, v8, 4\n\t" + "th.vand.vi v8, v8, 0xF\n\t" + "th.vsetvli zero, %[vl128], e8, m8\n\t" // vl == 128 + "th.vand.vx v0, v0, %[mask]\n\t" + "th.vor.vv v8, v8, v0\n\t" + "th.vlb.v v0, (%[q8])\n\t" + "th.vsub.vx v8, v8, %[vl32]\n\t" + "th.vsetvli zero, %[vl64], e8, m4\n\t" // vl == 64 + "th.vwmul.vv v16, v0, v8\n\t" + "th.vwmul.vv v24, v4, v12\n\t" + "li %[t0], 16\n\t" + "th.vsetvli zero, %[t0], e16, m2\n\t" // vl == 16 + "th.vmv.v.x v0, zero\n\t" + "th.vwredsum.vs v10, v16, v0\n\t" + "th.vwredsum.vs v9, v18, v0\n\t" + "th.vwredsum.vs v8, v20, v0\n\t" + "th.vwredsum.vs v7, v22, v0\n\t" + "th.vwredsum.vs v11, v24, v0\n\t" + "th.vwredsum.vs v12, v26, v0\n\t" + "th.vwredsum.vs v13, v28, v0\n\t" + "th.vwredsum.vs v14, v30, v0\n\t" + "li %[t0], 4\n\t" + "th.vsetvli zero, %[t0], e32, m1\n\t" // vl == 4 + "th.vslideup.vi v10, v9, 1\n\t" + "th.vslideup.vi v8, v7, 1\n\t" + "th.vslideup.vi v11, v12, 1\n\t" + "th.vslideup.vi v13, v14, 1\n\t" + "th.vslideup.vi v10, v8, 2\n\t" + "th.vslideup.vi v11, v13, 
2\n\t" + "li %[t0], 8\n\t" + "th.vsetvli zero, %[t0], e32, m2\n\t" // vl == 8 + "th.vlb.v v4, (%[scale])\n\t" + "th.vmul.vv v2, v4, v10\n\t" + "th.vredsum.vs v0, v2, v0\n\t" + "th.vmv.x.s %[t0], v0\n\t" + "add %[sumi], %[sumi], %[t0]" + : [sumi] "+&r" (sum_t), [t0] "=&r" (t0) + : [qh] "r" (qh), [q6] "r" (q6), [q8] "r" (q8), [scale] "r" (scale) + , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128) + , [mask] "r" (0x30) + : "memory" + , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23" + , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + q6 += 64; qh += 32; q8 += 128; scale += 8; + } + + sumf += d * sum_t; + + } + + *s = sumf; + +#elif defined __riscv_v + + float sumf = 0; + const int vector_length = __riscv_vlenb() * 8; + switch (vector_length) { case 256: for (int i = 0; i < nb; ++i) { diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 46f75ad97..c7426df2b 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -270,7 +270,11 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .from_float = quantize_row_q4_K, .vec_dot = ggml_vec_dot_q4_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, +#if defined (__ARM_FEATURE_MATMUL_INT8) + .nrows = 2, +#else .nrows = 1, +#endif }, [GGML_TYPE_Q5_K] = { .from_float = quantize_row_q5_K, @@ -2414,12 +2418,32 @@ static bool ggml_thread_apply_priority(int32_t prio) { // This is up to the applications. DWORD p = THREAD_PRIORITY_NORMAL; switch (prio) { + case GGML_SCHED_PRIO_LOW: p = THREAD_PRIORITY_BELOW_NORMAL; break; case GGML_SCHED_PRIO_NORMAL: p = THREAD_PRIORITY_NORMAL; break; case GGML_SCHED_PRIO_MEDIUM: p = THREAD_PRIORITY_ABOVE_NORMAL; break; case GGML_SCHED_PRIO_HIGH: p = THREAD_PRIORITY_HIGHEST; break; case GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break; } + if (prio != GGML_SCHED_PRIO_LOW) { + // Tell Windows that this thread should not be throttled (needs its own CPU core). 
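
For context, the Win32 facility used just below, shown as a standalone sketch (the helper name is ours; the structure and call are the documented Windows API, guarded the same way as in the patch):

#include <windows.h>

/* Ask Windows not to apply power throttling (EcoQoS) to the calling thread, so the
   scheduler keeps it on a full-speed core instead of parking it with the others. */
static int ggml_win32_disable_thread_throttling(void) {
#if _WIN32_WINNT >= 0x0602
    THREAD_POWER_THROTTLING_STATE t;
    ZeroMemory(&t, sizeof(t));
    t.Version     = THREAD_POWER_THROTTLING_CURRENT_VERSION;
    t.ControlMask = THREAD_POWER_THROTTLING_EXECUTION_SPEED; /* take manual control ... */
    t.StateMask   = 0;                                       /* ... and turn throttling off */
    return SetThreadInformation(GetCurrentThread(), ThreadPowerThrottling, &t, sizeof(t)) != 0;
#else
    return 1; /* SDK does not expose the API: nothing to do */
#endif
}
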
+ // Newer Windows 11 versions aggresively park (offline) CPU cores and often place + // all our threads onto the first 4 cores which results in terrible performance with + // n_threads > 4 + #if _WIN32_WINNT >= 0x0602 + THREAD_POWER_THROTTLING_STATE t; + ZeroMemory(&t, sizeof(t)); + t.Version = THREAD_POWER_THROTTLING_CURRENT_VERSION; + t.ControlMask = THREAD_POWER_THROTTLING_EXECUTION_SPEED; + t.StateMask = 0; + + if (!SetThreadInformation(GetCurrentThread(), ThreadPowerThrottling, &t, sizeof(t))) { + GGML_LOG_DEBUG("failed to disable thread power throttling %d : (%d)\n", prio, (int) GetLastError()); + return false; + } + #endif + } + if (prio == GGML_SCHED_PRIO_NORMAL) { // Keep inherited policy/priority return true; @@ -2447,6 +2471,8 @@ static bool ggml_thread_apply_priority(int32_t prio) { struct sched_param p; int32_t policy = SCHED_OTHER; switch (prio) { + // TODO: there seems to be no way to set lower prio on Apple platforms + case GGML_SCHED_PRIO_LOW: policy = SCHED_OTHER; p.sched_priority = 0; break; case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break; @@ -2503,6 +2529,7 @@ static bool ggml_thread_apply_priority(int32_t prio) { struct sched_param p; int32_t policy = SCHED_OTHER; switch (prio) { + case GGML_SCHED_PRIO_LOW: policy = SCHED_BATCH; p.sched_priority = 0; break; case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break; @@ -3484,6 +3511,19 @@ void ggml_cpu_init(void) { const uint64_t t_end = ggml_time_us(); UNUSED(t_end); GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0); + +#ifdef GGML_USE_OPENMP + //if (!getenv("OMP_WAIT_POLICY")) { + // // set the wait policy to active, so that OpenMP threads don't sleep + // putenv("OMP_WAIT_POLICY=active"); + //} + + if (!getenv("KMP_BLOCKTIME")) { + // set the time to wait before sleeping a thread + // this is less aggressive than setting the wait policy to active, but should achieve similar results in most cases + putenv("KMP_BLOCKTIME=200"); // 200ms + } +#endif } #if defined(__ARM_ARCH) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 26501b711..d8de7531b 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7633,39 +7633,83 @@ static void ggml_compute_forward_ssm_scan_f32( const int ir1 = MIN(ir0 + dr, nr); const int ir = ir1 - ir0; - for (int i3 = 0; i3 < n_s; ++i3) { - for (int i2 = 0; i2 < n_t; ++i2) { - const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s} - const float * x = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s} - const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s} - const float * A = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner} - const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s} - const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // 
{d_state, n_t, n_s} - float * y = ( float *) (( char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s} - float * s = ( float *) (( char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s} + #ifdef __ARM_FEATURE_SVE + for (int i3 = 0; i3 < n_s; ++i3) { + for (int i2 = 0; i2 < n_t; ++i2) { + const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s} + const float * x = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s} + const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s} + const float * A = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner} + const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s} + const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s} + float * y = ( float *) (( char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s} + float * s = ( float *) (( char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s} - // use the output as the source for the next token-wise iterations - if (i2 > 0) { s0 = s; } + // use the output as the source for the next token-wise iterations + if (i2 > 0) { s0 = s; } - // d_inner - for (int i1 = 0; i1 < ir; ++i1) { - // ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78 - float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1]; - float x_dt = x[i1] * dt_soft_plus; - float sumf = 0.0f; - // d_state - for (int i0 = 0; i0 < nc; ++i0) { - int i = i0 + i1*nc; - // state = prev_state * dA + dB * x - float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt); - // y = rowwise_dotprod(state, C) - sumf += state * C[i0]; - s[i] = state; + // d_inner + for (int i1 = 0; i1 < ir; ++i1) { + float dt_soft_plus = dt[i1] <= 20.0f ? 
log1pf(expf(dt[i1])) : dt[i1]; + float x_dt = x[i1] * dt_soft_plus; + svfloat32_t vx_dt = GGML_F32_VEC_SET1(x_dt); + svfloat32_t vdt_soft_plus = GGML_F32_VEC_SET1(dt_soft_plus); + svfloat32_t r1_vector = GGML_F32_VEC_ZERO; + + for (int64_t k = 0; k < nc; k += svcntw()) { + svfloat32_t vA = GGML_F32_VEC_LOAD(&A[i1*nc + k]); + svfloat32_t vB = GGML_F32_VEC_LOAD(&B[k]); + svfloat32_t vC = GGML_F32_VEC_LOAD(&C[k]); + svfloat32_t vs0 = GGML_F32_VEC_LOAD(&s0[i1*nc + k]); + + svfloat32_t t1 = GGML_F32_VEC_MUL(vdt_soft_plus, vA); + t1 = exp_ps_sve(svptrue_b32(), t1); + svfloat32_t t2 = GGML_F32_VEC_MUL(vx_dt, vB); + + vs0 = GGML_F32_VEC_FMA(vs0, t1, t2); + r1_vector = GGML_F32_VEC_ADD(GGML_F32_VEC_MUL(vs0, vC), r1_vector); + + GGML_F32_VEC_STORE(&s[i1*nc + k], vs0); + } + y[i1] = GGML_F32xt_REDUCE_ONE(r1_vector); } - y[i1] = sumf; } } - } + #else + for (int i3 = 0; i3 < n_s; ++i3) { + for (int i2 = 0; i2 < n_t; ++i2) { + const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s} + const float * x = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s} + const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s} + const float * A = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner} + const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s} + const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s} + float * y = ( float *) (( char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s} + float * s = ( float *) (( char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s} + + // use the output as the source for the next token-wise iterations + if (i2 > 0) { s0 = s; } + + // d_inner + for (int i1 = 0; i1 < ir; ++i1) { + // ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78 + float dt_soft_plus = dt[i1] <= 20.0f ? 
log1pf(expf(dt[i1])) : dt[i1]; + float x_dt = x[i1] * dt_soft_plus; + float sumf = 0.0f; + // d_state + for (int i0 = 0; i0 < nc; ++i0) { + int i = i0 + i1*nc; + // state = prev_state * dA + dB * x + float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt); + // y = rowwise_dotprod(state, C) + sumf += state * C[i0]; + s[i] = state; + } + y[i1] = sumf; + } + } + } + #endif } void ggml_compute_forward_ssm_scan( @@ -8070,6 +8114,14 @@ static void ggml_compute_forward_rwkv_wkv6_f32( #define GGML_F32X_MUL GGML_F32x16_MUL #define GGML_F32X_FMA GGML_F32x16_FMA #define WKV_VECTOR_SIZE 16 + #elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__) + #define GGML_F32X GGML_F32xt + #define GGML_F32X_SET1 GGML_F32xt_SET1 + #define GGML_F32X_LOAD GGML_F32xt_LOAD + #define GGML_F32X_STORE GGML_F32xt_STORE + #define GGML_F32X_MUL GGML_F32xt_MUL + #define GGML_F32X_FMA GGML_F32xt_FMA + #define WKV_VECTOR_SIZE 8 #elif defined(__ARM_NEON) && defined(__aarch64__) #define GGML_F32X GGML_F32x4 #define GGML_F32X_SET1 GGML_F32x4_SET1 @@ -8080,8 +8132,14 @@ static void ggml_compute_forward_rwkv_wkv6_f32( #define WKV_VECTOR_SIZE 4 #endif + int wkv_vector_size; #ifdef WKV_VECTOR_SIZE - const int64_t vec_count = head_size / WKV_VECTOR_SIZE; + #if defined(__ARM_FEATURE_SVE) + wkv_vector_size = svcntw(); + #else + wkv_vector_size = WKV_VECTOR_SIZE; + #endif + const int64_t vec_count = head_size / wkv_vector_size; for (int64_t t = 0; t < T; t++) { size_t t_offset = t * t_stride; @@ -8111,7 +8169,7 @@ static void ggml_compute_forward_rwkv_wkv6_f32( GGML_F32X time_decay_vec = GGML_F32X_SET1(time_decay_val); for (int64_t j = 0; j < vec_count; j++) { - size_t base_j = j * WKV_VECTOR_SIZE; + size_t base_j = j * wkv_vector_size; size_t t_h_j_offset = t_h_offset + base_j; size_t h_2d_i_j_offset = h_2d_i_offset + base_j; @@ -8136,7 +8194,7 @@ static void ggml_compute_forward_rwkv_wkv6_f32( } // Handle remaining elements, this will not be used. 
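
Because the SVE vector width is only known at run time, the wkv6 hunk above turns the compile-time WKV_VECTOR_SIZE constant into a per-run value; a small sketch of that idea (the helper name is illustrative only):

#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
#endif

/* 32-bit lanes processed per vector op: 4 on SVE128, 8 on SVE256, 16 on SVE512.
   vec_count = head_size / lanes; anything past vec_count * lanes is left to the
   scalar remainder loop below. */
static inline int wkv_lane_count(void) {
#if defined(__ARM_FEATURE_SVE)
    return (int) svcntw();
#else
    return 4; /* fixed NEON-style width, as in the non-SVE builds */
#endif
}
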
- for (int64_t j = vec_count * WKV_VECTOR_SIZE; j < head_size; j++) { + for (int64_t j = vec_count * wkv_vector_size; j < head_size; j++) { size_t t_h_j_offset = t_h_offset + j; size_t h_2d_i_j_offset = h_2d_i_offset + j; float v_val = v[t_h_j_offset]; @@ -8272,6 +8330,14 @@ static void ggml_compute_forward_gla_f32( #define GGML_F32X_MUL GGML_F32x16_MUL #define GGML_F32X_FMA GGML_F32x16_FMA #define GLA_VECTOR_SIZE 16 + #elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__) + #define GGML_F32X GGML_F32xt + #define GGML_F32X_SET1 GGML_F32xt_SET1 + #define GGML_F32X_LOAD GGML_F32xt_LOAD + #define GGML_F32X_STORE GGML_F32xt_STORE + #define GGML_F32X_MUL GGML_F32xt_MUL + #define GGML_F32X_FMA GGML_F32xt_FMA + #define GLA_VECTOR_SIZE 8 #elif defined(__ARM_NEON) && defined(__aarch64__) #define GGML_F32X GGML_F32x4 #define GGML_F32X_SET1 GGML_F32x4_SET1 @@ -8282,8 +8348,14 @@ static void ggml_compute_forward_gla_f32( #define GLA_VECTOR_SIZE 4 #endif + int gla_vector_size; #ifdef GLA_VECTOR_SIZE - const int64_t vec_count = head_size / GLA_VECTOR_SIZE; + #if defined(__ARM_FEATURE_SVE) + gla_vector_size = svcntw(); + #else + gla_vector_size = GLA_VECTOR_SIZE; + #endif + const int64_t vec_count = head_size / gla_vector_size; for (int64_t t = 0; t < T; t++) { size_t t_offset = t * t_stride; @@ -8310,7 +8382,7 @@ static void ggml_compute_forward_gla_f32( GGML_F32X g_vec = GGML_F32X_SET1(g_val); for (int64_t j = 0; j < vec_count; j++) { - size_t base_j = j * GLA_VECTOR_SIZE; + size_t base_j = j * gla_vector_size; size_t t_h_j_offset = t_h_offset + base_j; size_t h_2d_i_j_offset = h_2d_i_offset + base_j; @@ -8334,7 +8406,7 @@ static void ggml_compute_forward_gla_f32( } // Handle remaining elements, this will not be used. - for (int64_t j = vec_count * GLA_VECTOR_SIZE; j < head_size; j++) { + for (int64_t j = vec_count * gla_vector_size; j < head_size; j++) { size_t t_h_j_offset = t_h_offset + j; size_t h_2d_i_j_offset = h_2d_i_offset + j; float v_val = v[t_h_j_offset]; @@ -8443,83 +8515,126 @@ static void ggml_compute_forward_rwkv_wkv7_f32( int64_t h_stride_2d = head_size * head_size; #if defined(GGML_SIMD) - for (int64_t t = 0; t < T; t++) { - int64_t t_offset = t * t_stride; - int64_t state_offset = head_size * C * (t / (T / n_seqs)); - float * state_cur = state + state_offset; - float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset; + #if defined(__ARM_FEATURE_SVE) + // scalar Route to scalar implementation //TODO: Write SVE code + for (int64_t t = 0; t < T; t++) { + int64_t t_offset = t * t_stride; + int64_t state_offset = head_size * C * (t / (T / n_seqs)); + float * state_cur = state + state_offset; + float * state_prev = t % (T / n_seqs) ? 
state_cur : (float*)dst->src[6]->data + state_offset; - for (int64_t h = h_start; h < h_end; h++) { - int64_t h_offset = h * h_stride; - int64_t t_h_offset = t_offset + h_offset; - int64_t h_2d_offset = h * h_stride_2d; + for (int64_t h = h_start; h < h_end; h++) { + int64_t h_offset = h * h_stride; + int64_t t_h_offset = t_offset + h_offset; + int64_t h_2d_offset = h * h_stride_2d; - for (int64_t ii = 0; ii < head_size; ii++) { - int64_t t_h_i_offset = t_h_offset + ii; - int64_t h_2d_i_offset = h_2d_offset + ii * h_stride; + for (int64_t i = 0; i < head_size; i++) { + int64_t t_h_i_offset = t_h_offset + i; + int64_t h_2d_i_offset = h_2d_offset + i * h_stride; - GGML_F32_VEC v_vec = GGML_F32_VEC_SET1(v[t_h_i_offset]); + float v_val = v[t_h_i_offset]; - float sa = 0; - { - GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; - GGML_F32_VEC ax[GGML_F32_ARR]; - GGML_F32_VEC ay[GGML_F32_ARR]; - for (int64_t j = 0; j < head_size; j += GGML_F32_STEP) { - for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) { - ax[kk] = GGML_F32_VEC_LOAD(&a[t_h_offset + j + kk * GGML_F32_EPR]); - ay[kk] = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_offset + j + kk * GGML_F32_EPR]); - sum[kk] = GGML_F32_VEC_FMA(sum[kk], ax[kk], ay[kk]); - } + float sa = 0, result = 0; + for (int64_t j = 0; j < head_size; j++) { + sa += a[t_h_offset + j] * state_prev[h_2d_i_offset + j]; } - GGML_F32_VEC_REDUCE(sa, sum); - } - GGML_F32_VEC sa_vec = GGML_F32_VEC_SET1(sa); + for (int64_t j = 0; j < head_size; j++) { + int64_t t_h_j_offset = t_h_offset + j; + int64_t h_2d_i_j_offset = h_2d_i_offset + j; - int64_t j = 0; - GGML_F32_VEC result_vec[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; - for (; j < head_size; j += GGML_F32_STEP) { - for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) { - int64_t t_h_j_offset = t_h_offset + j + kk * GGML_F32_EPR; - int64_t h_2d_i_j_offset = h_2d_i_offset + j + kk * GGML_F32_EPR; - - GGML_F32_VEC r_vec = GGML_F32_VEC_LOAD(&r[t_h_j_offset]); - GGML_F32_VEC w_vec = GGML_F32_VEC_LOAD(&w[t_h_j_offset]); - GGML_F32_VEC k_vec = GGML_F32_VEC_LOAD(&k[t_h_j_offset]); - GGML_F32_VEC b_vec = GGML_F32_VEC_LOAD(&b[t_h_j_offset]); - - k_vec = GGML_F32_VEC_MUL(v_vec, k_vec); - - GGML_F32_VEC state_vec = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_j_offset]); - // kv + s * decay + sa * b - state_vec = GGML_F32_VEC_FMA(k_vec, state_vec, w_vec); - state_vec = GGML_F32_VEC_FMA(state_vec, sa_vec, b_vec); - GGML_F32_VEC_STORE(&state_cur[h_2d_i_j_offset], state_vec); - - result_vec[kk] = GGML_F32_VEC_FMA(result_vec[kk], state_vec, r_vec); + float r_val = r[t_h_j_offset]; + float w_val = w[t_h_j_offset]; + float k_val = k[t_h_j_offset]; + float b_val = b[t_h_j_offset]; + float kv_val = v_val * k_val; + float prev_state_val = state_prev[h_2d_i_j_offset]; + state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val; + result += state_cur[h_2d_i_j_offset] * r_val; } - } - GGML_F32_VEC_REDUCE(dst_data[t_h_i_offset], result_vec); - - // There shouldn't be left-overs though. 
- for (; j < head_size; j++) { - int64_t t_h_j_offset = t_h_offset + j; - int64_t h_2d_i_j_offset = h_2d_i_offset + j; - - float r_val = r[t_h_j_offset]; - float w_val = w[t_h_j_offset]; - float k_val = k[t_h_j_offset]; - float b_val = b[t_h_j_offset]; - float kv_val = v[t_h_i_offset] * k_val; - - float prev_state_val = state_prev[h_2d_i_j_offset]; - state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val; - dst_data[t_h_i_offset] += state_cur[h_2d_i_j_offset] * r_val; + dst_data[t_h_i_offset] = result; } } } - } + #else + for (int64_t t = 0; t < T; t++) { + int64_t t_offset = t * t_stride; + int64_t state_offset = head_size * C * (t / (T / n_seqs)); + float * state_cur = state + state_offset; + float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset; + + for (int64_t h = h_start; h < h_end; h++) { + int64_t h_offset = h * h_stride; + int64_t t_h_offset = t_offset + h_offset; + int64_t h_2d_offset = h * h_stride_2d; + + for (int64_t ii = 0; ii < head_size; ii++) { + int64_t t_h_i_offset = t_h_offset + ii; + int64_t h_2d_i_offset = h_2d_offset + ii * h_stride; + + GGML_F32_VEC v_vec = GGML_F32_VEC_SET1(v[t_h_i_offset]); + + float sa = 0; + { + GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; + GGML_F32_VEC ax[GGML_F32_ARR]; + GGML_F32_VEC ay[GGML_F32_ARR]; + for (int64_t j = 0; j < head_size; j += GGML_F32_STEP) { + for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) { + ax[kk] = GGML_F32_VEC_LOAD(&a[t_h_offset + j + kk * GGML_F32_EPR]); + ay[kk] = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_offset + j + kk * GGML_F32_EPR]); + sum[kk] = GGML_F32_VEC_FMA(sum[kk], ax[kk], ay[kk]); + } + } + GGML_F32_VEC_REDUCE(sa, sum); + } + + GGML_F32_VEC sa_vec = GGML_F32_VEC_SET1(sa); + + int64_t j = 0; + GGML_F32_VEC result_vec[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; + for (; j < head_size; j += GGML_F32_STEP) { + for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) { + int64_t t_h_j_offset = t_h_offset + j + kk * GGML_F32_EPR; + int64_t h_2d_i_j_offset = h_2d_i_offset + j + kk * GGML_F32_EPR; + + GGML_F32_VEC r_vec = GGML_F32_VEC_LOAD(&r[t_h_j_offset]); + GGML_F32_VEC w_vec = GGML_F32_VEC_LOAD(&w[t_h_j_offset]); + GGML_F32_VEC k_vec = GGML_F32_VEC_LOAD(&k[t_h_j_offset]); + GGML_F32_VEC b_vec = GGML_F32_VEC_LOAD(&b[t_h_j_offset]); + + k_vec = GGML_F32_VEC_MUL(v_vec, k_vec); + + GGML_F32_VEC state_vec = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_j_offset]); + // kv + s * decay + sa * b + state_vec = GGML_F32_VEC_FMA(k_vec, state_vec, w_vec); + state_vec = GGML_F32_VEC_FMA(state_vec, sa_vec, b_vec); + GGML_F32_VEC_STORE(&state_cur[h_2d_i_j_offset], state_vec); + + result_vec[kk] = GGML_F32_VEC_FMA(result_vec[kk], state_vec, r_vec); + } + } + GGML_F32_VEC_REDUCE(dst_data[t_h_i_offset], result_vec); + + // There shouldn't be left-overs though. 
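
The SVE branch above falls back to a scalar loop for now (see the TODO), while the generic SIMD path retained here vectorizes the same recurrence; for reference, one output row of that recurrence in plain scalar form (a sketch, not the backend code):

/* wkv7 row update: sa = dot(a, prev_state_row); state = prev_state*w + v*k + sa*b;
   the row's output is dot(state, r). All pointers address one head_size-long row. */
static float wkv7_row(float * state_cur, const float * state_prev,
                      const float * r, const float * w, const float * k,
                      const float * a, const float * b, float v, int head_size) {
    float sa = 0.0f;
    for (int j = 0; j < head_size; ++j) {
        sa += a[j] * state_prev[j];
    }
    float out = 0.0f;
    for (int j = 0; j < head_size; ++j) {
        state_cur[j] = state_prev[j] * w[j] + v * k[j] + sa * b[j];
        out += state_cur[j] * r[j];
    }
    return out;
}
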
+ for (; j < head_size; j++) { + int64_t t_h_j_offset = t_h_offset + j; + int64_t h_2d_i_j_offset = h_2d_i_offset + j; + + float r_val = r[t_h_j_offset]; + float w_val = w[t_h_j_offset]; + float k_val = k[t_h_j_offset]; + float b_val = b[t_h_j_offset]; + float kv_val = v[t_h_i_offset] * k_val; + + float prev_state_val = state_prev[h_2d_i_j_offset]; + state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val; + dst_data[t_h_i_offset] += state_cur[h_2d_i_j_offset] * r_val; + } + } + } + } + #endif #else for (int64_t t = 0; t < T; t++) { int64_t t_offset = t * t_stride; diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h index 45c31cf1f..2e3669c01 100644 --- a/ggml/src/ggml-cpu/simd-mappings.h +++ b/ggml/src/ggml-cpu/simd-mappings.h @@ -17,7 +17,123 @@ // number of elements to fit in a single register // -#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_FMA) + +#define GGML_SIMD + +// F32 SVE +#define GGML_F32_EPR 8 +#define DEFAULT_PG svptrue_b32() + +#define GGML_F32xt svfloat32_t +#define GGML_F32xt_ZERO svdup_n_f32(0.0f) +#define GGML_F32xt_SET1(x) svdup_n_f32(x) +#define GGML_F32xt_LOAD_IMPL(pg, a, ...) svld1_f32(pg, a) +#define GGML_F32xt_LOAD(...) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__) +#define GGML_F32xt_STORE_IMPL(pg,a,b) svst1_f32(pg, a, b) +#define GGML_F32xt_STORE(...) GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__) +#define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, a, b, c) +#define GGML_F32xt_FMA(...) GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__) +#define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b) +#define GGML_F32xt_ADD(...) GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__) +#define GGML_F32xt_MUL_IMPL(pg, a, b) svmul_f32_m(pg, a, b) +#define GGML_F32xt_MUL(...) GGML_F32xt_MUL_IMPL(DEFAULT_PG, __VA_ARGS__) +#define GGML_F32xt_REDUCE_ONE_IMPL(pg, a) svaddv(pg, a) +#define GGML_F32xt_REDUCE_ONE(...) GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, __VA_ARGS__) +#define GGML_F32xt_REDUCE_IMPL(pg, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \ +{ \ + sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum2); \ + sum3 = svadd_f32_m(DEFAULT_PG, sum3, sum4); \ + sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum6); \ + sum7 = svadd_f32_m(DEFAULT_PG, sum7, sum8); \ + sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum3); \ + sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum7); \ + sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum5); \ + (res) = (ggml_float) GGML_F32xt_REDUCE_ONE(sum1); \ +} +#define GGML_F32xt_REDUCE(...) 
GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, __VA_ARGS__) + +#define GGML_F32_VEC GGML_F32xt +#define GGML_F32_VEC_ZERO GGML_F32xt_ZERO +#define GGML_F32_VEC_SET1 GGML_F32xt_SET1 +#define GGML_F32_VEC_LOAD GGML_F32xt_LOAD +#define GGML_F32_VEC_STORE GGML_F32xt_STORE +#define GGML_F32_VEC_FMA GGML_F32xt_FMA +#define GGML_F32_VEC_ADD GGML_F32xt_ADD +#define GGML_F32_VEC_MUL GGML_F32xt_MUL +#define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE + +// F16 NEON + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + #define GGML_F16_STEP 32 + #define GGML_F16_EPR 8 + + #define GGML_F16x8 float16x8_t + #define GGML_F16x8_ZERO vdupq_n_f16(0.0f) + #define GGML_F16x8_SET1(x) vdupq_n_f16(x) + #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x)) + #define GGML_F16x8_STORE vst1q_f16 + #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) + #define GGML_F16x8_ADD vaddq_f16 + #define GGML_F16x8_MUL vmulq_f16 + #define GGML_F16x8_REDUCE(res, x) \ + do { \ + int offset = GGML_F16_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ + } \ + const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \ + const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \ + (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ + } while (0) + + #define GGML_F16_VEC GGML_F16x8 + #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO + #define GGML_F16_VEC_SET1 GGML_F16x8_SET1 + #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p) + #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i]) + #define GGML_F16_VEC_FMA GGML_F16x8_FMA + #define GGML_F16_VEC_ADD GGML_F16x8_ADD + #define GGML_F16_VEC_MUL GGML_F16x8_MUL + #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE +#else + // if FP16 vector arithmetic is not supported, we use FP32 instead + // and take advantage of the vcvt_ functions to convert to/from FP16 + + #define GGML_F16_STEP 16 + #define GGML_F16_EPR 4 + + #define GGML_F32Cx4 float32x4_t + #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f) + #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x) + #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x))) + #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y)) + #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) + #define GGML_F32Cx4_ADD vaddq_f32 + #define GGML_F32Cx4_MUL vmulq_f32 + #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE + + #define GGML_F16_VEC GGML_F32Cx4 + #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO + #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 + #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) + #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i]) + #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA + #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD + #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL + #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE +#endif + +#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) #define GGML_SIMD diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp index 02d406182..f7614568e 100644 --- a/ggml/src/ggml-cpu/vec.cpp +++ b/ggml/src/ggml-cpu/vec.cpp @@ -17,29 +17,98 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G #if defined(GGML_SIMD) float sumf = 0.0f; - const int np = (n & ~(GGML_F32_STEP - 1)); - GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; + #if defined(__ARM_FEATURE_SVE) + const int sve_register_length = 
ggml_cpu_get_sve_cnt() * 8; + const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16 + const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers - GGML_F32_VEC ax[GGML_F32_ARR]; - GGML_F32_VEC ay[GGML_F32_ARR]; + const int np = (n & ~(ggml_f32_step - 1)); + svfloat32_t sum1 = svdup_n_f32(0.0f); + svfloat32_t sum2 = svdup_n_f32(0.0f); + svfloat32_t sum3 = svdup_n_f32(0.0f); + svfloat32_t sum4 = svdup_n_f32(0.0f); + svfloat32_t sum5 = svdup_n_f32(0.0f); + svfloat32_t sum6 = svdup_n_f32(0.0f); + svfloat32_t sum7 = svdup_n_f32(0.0f); + svfloat32_t sum8 = svdup_n_f32(0.0f); + svfloat32_t ax1,ax2,ax3,ax4,ax5,ax6,ax7,ax8; + svfloat32_t ay1,ay2,ay3,ay4,ay5,ay6,ay7,ay8; + for (int i = 0; i < np; i += ggml_f32_step) { + ax1 = GGML_F32_VEC_LOAD(x + i); + ay1 = GGML_F32_VEC_LOAD(y + i); + sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1); - for (int i = 0; i < np; i += GGML_F32_STEP) { - for (int j = 0; j < GGML_F32_ARR; j++) { - ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); - ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); + ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr); + ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr); + sum2 = GGML_F32_VEC_FMA(ax2, ay2, sum2); - sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]); + ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr); + ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr); + sum3 = GGML_F32_VEC_FMA(ax3, ay3, sum3); + + ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr); + ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr); + sum4 = GGML_F32_VEC_FMA(ax4, ay4, sum4); + + ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr); + ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr); + sum5 = GGML_F32_VEC_FMA(ax5, ay5, sum5); + + ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr); + ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr); + sum6 = GGML_F32_VEC_FMA(ax6, ay6, sum6); + + ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr); + ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr); + sum7 = GGML_F32_VEC_FMA(ax7, ay7, sum7); + + ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr); + ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr); + sum8 = GGML_F32_VEC_FMA(ax8, ay8, sum8); } - } + // leftovers + // Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop + const int np2 = (n & ~(ggml_f32_epr - 1)); + for (int i = np; i < np2; i += ggml_f32_epr) { + ax1 = GGML_F32_VEC_LOAD(x + i); + ay1 = GGML_F32_VEC_LOAD(y + i); + sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1); + } + // maximum number of leftover elements will be less that ggml_f32_epr. 
Apply predicated svmad on available elements only + if (np2 < n) { + svbool_t pg = svwhilelt_b32(np2, n); + ax1 = svld1_f32(pg, x + np2); + ay1 = svld1_f32(pg, y + np2); + sum1 = svmad_f32_m(pg, ax1, ay1, sum1); + } + // reduce sum1,sum2 to sum1 + GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8); + #else + const int np = (n & ~(GGML_F32_STEP - 1)); - // reduce sum0..sum3 to sum0 - GGML_F32_VEC_REDUCE(sumf, sum); + GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; - // leftovers - for (int i = np; i < n; ++i) { - sumf += x[i]*y[i]; - } + GGML_F32_VEC ax[GGML_F32_ARR]; + GGML_F32_VEC ay[GGML_F32_ARR]; + + for (int i = 0; i < np; i += GGML_F32_STEP) { + for (int j = 0; j < GGML_F32_ARR; j++) { + ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); + ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); + + sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]); + } + } + + // reduce sum0..sum3 to sum0 + GGML_F32_VEC_REDUCE(sumf, sum); + + // leftovers + for (int i = np; i < n; ++i) { + sumf += x[i]*y[i]; + } + #endif #else // scalar ggml_float sumf = 0.0; diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index c77349ebe..09dbade21 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -5,6 +5,7 @@ #include "ggml-impl.h" #include "simd-mappings.h" #include "ggml.h" +#include "ggml-cpu.h" #if defined(GGML_USE_ACCELERATE) #include @@ -148,27 +149,108 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) { #if defined(GGML_SIMD) - const int np = (n & ~(GGML_F32_STEP - 1)); + #if defined(__ARM_FEATURE_SVE) - GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); + const int sve_register_length = ggml_cpu_get_sve_cnt() * 8; + const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16 + const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers + GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); - GGML_F32_VEC ax[GGML_F32_ARR]; - GGML_F32_VEC ay[GGML_F32_ARR]; + const int np = (n & ~(ggml_f32_step - 1)); + svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8; + svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8; + for (int i = 0; i < np; i += ggml_f32_step) { - for (int i = 0; i < np; i += GGML_F32_STEP) { - for (int j = 0; j < GGML_F32_ARR; j++) { - ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); - ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); - ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx); + ax1 = GGML_F32_VEC_LOAD(x + i); + ay1 = GGML_F32_VEC_LOAD(y + i); + ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1); - GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); + GGML_F32_VEC_STORE(y + i, ay1); + + ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr); + ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr); + ay2 = GGML_F32_VEC_FMA(ax2, vx, ay2); + + GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2); + + ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr); + ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr); + ay3 = GGML_F32_VEC_FMA(ax3, vx, ay3); + + GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3); + + ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr); + ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr); + ay4 = GGML_F32_VEC_FMA(ax4, vx, ay4); + + GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4); + + ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr); + ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr); + ay5 = GGML_F32_VEC_FMA(ax5, vx, ay5); + + GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5); + + ax6 = 
GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr); + ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr); + ay6 = GGML_F32_VEC_FMA(ax6, vx, ay6); + + GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6); + + ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr); + ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr); + ay7 = GGML_F32_VEC_FMA(ax7, vx, ay7); + + GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7); + + ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr); + ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr); + ay8 = GGML_F32_VEC_FMA(ax8, vx, ay8); + + GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8); } - } + // leftovers + // Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop + const int np2 = (n & ~(ggml_f32_epr - 1)); + for (int i = np; i < np2; i += ggml_f32_epr) { + ax1 = GGML_F32_VEC_LOAD(x + i); + ay1 = GGML_F32_VEC_LOAD(y + i); + ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1); - // leftovers - for (int i = np; i < n; ++i) { - y[i] += x[i]*v; - } + GGML_F32_VEC_STORE(y + i, ay1); + } + // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only + if (np2 < n) { + svbool_t pg =svwhilelt_b32(np2, n); + ax1 = svld1_f32(pg, x + np2); + ay1 = svld1_f32(pg, y + np2); + ay1 = svmad_f32_m(pg, ax1, vx, ay1); + + svst1_f32(pg, y + np2, ay1); + } + #else + const int np = (n & ~(GGML_F32_STEP - 1)); + + GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); + + GGML_F32_VEC ax[GGML_F32_ARR]; + GGML_F32_VEC ay[GGML_F32_ARR]; + + for (int i = 0; i < np; i += GGML_F32_STEP) { + for (int j = 0; j < GGML_F32_ARR; j++) { + ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); + ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); + ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx); + + GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); + } + } + + // leftovers + for (int i = np; i < n; ++i) { + y[i] += x[i]*v; + } + #endif #else // scalar for (int i = 0; i < n; ++i) { @@ -220,36 +302,45 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int } #if defined(GGML_SIMD) - const int np = (n & ~(GGML_F32_STEP - 1)); - - GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL]; - - for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { - vx[k] = GGML_F32_VEC_SET1(v[k][0]); - } - - GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR]; - GGML_F32_VEC ay[GGML_F32_ARR]; - - for (int i = 0; i < np; i += GGML_F32_STEP) { - for (int j = 0; j < GGML_F32_ARR; j++) { - ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); - - for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { - ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR); - ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]); + #if defined(__ARM_FEATURE_SVE) + // scalar Route to scalar implementation //TODO: Write SVE code + for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { + for (int i = 0; i < n; ++i) { + y[i] += x[k][i]*v[k][0]; } - - GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); } - } + #else + const int np = (n & ~(GGML_F32_STEP - 1)); - // leftovers - for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { - for (int i = np; i < n; ++i) { - y[i] += x[k][i]*v[k][0]; + GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL]; + + for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { + vx[k] = GGML_F32_VEC_SET1(v[k][0]); } - } + + GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR]; + GGML_F32_VEC ay[GGML_F32_ARR]; + + for (int i = 0; i < np; i += GGML_F32_STEP) { + for (int j = 0; j < GGML_F32_ARR; j++) { + ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); + + for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { + ax[k][j] = 
GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR); + ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]); + } + + GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); + } + } + + // leftovers + for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { + for (int i = np; i < n; ++i) { + y[i] += x[k][i]*v[k][0]; + } + } + #endif #else // scalar for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { @@ -265,25 +356,53 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { #if defined(GGML_USE_ACCELERATE) vDSP_vsmul(y, 1, &v, y, 1, n); #elif defined(GGML_SIMD) - const int np = (n & ~(GGML_F32_STEP - 1)); + #if defined(__ARM_FEATURE_SVE) + const int sve_register_length = ggml_cpu_get_sve_cnt() * 8; + const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16 + const int ggml_f32_step = 2 * ggml_f32_epr; - GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); + GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); + const int np = (n & ~(ggml_f32_step - 1)); + svfloat32_t ay1; + svfloat32_t ay2; + for (int i = 0; i < np; i += ggml_f32_step) { + ay1 = GGML_F32_VEC_LOAD(y + i); + ay1 = GGML_F32_VEC_MUL(ay1, vx); + GGML_F32_VEC_STORE(y + i, ay1); - GGML_F32_VEC ay[GGML_F32_ARR]; - - for (int i = 0; i < np; i += GGML_F32_STEP) { - for (int j = 0; j < GGML_F32_ARR; j++) { - ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); - ay[j] = GGML_F32_VEC_MUL(ay[j], vx); - - GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); + ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr); + ay2 = GGML_F32_VEC_MUL(ay2, vx); + GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2); } - } + // leftovers + // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only + if (np < n) { + svbool_t pg = svwhilelt_b32(np, n); + ay1 = svld1_f32(pg, y + np); + ay1 = svmul_f32_m(pg, ay1, vx); + svst1_f32(pg, y + np, ay1); + } + #else + const int np = (n & ~(GGML_F32_STEP - 1)); - // leftovers - for (int i = np; i < n; ++i) { - y[i] *= v; - } + GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); + + GGML_F32_VEC ay[GGML_F32_ARR]; + + for (int i = 0; i < np; i += GGML_F32_STEP) { + for (int j = 0; j < GGML_F32_ARR; j++) { + ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); + ay[j] = GGML_F32_VEC_MUL(ay[j], vx); + + GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); + } + } + + // leftovers + for (int i = np; i < n; ++i) { + y[i] *= v; + } + #endif #else // scalar for (int i = 0; i < n; ++i) { @@ -528,6 +647,42 @@ inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) { #error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461" #endif +/* Below function was borrowed from the GitHub repository: +https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp */ +#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__) + inline static svfloat32_t exp_ps_sve(svbool_t pg, svfloat32_t src) { + // Constants + const svfloat32_t log2_e = svdup_n_f32(1.4426950409f); + const svfloat32_t ln2 = svdup_n_f32(0.6931473921f); + const svfloat32_t half_ln2_sq = svdup_n_f32(0.2413862043f); + const svuint32_t not_mask17 = svdup_n_u32(~((1u << 17) - 1)); + const svfloat32_t one = svdup_n_f32(1.0f); + const svfloat32_t inactive1 = svdup_n_f32(0.0f); + const svint32_t inactive2 = svdup_n_s32(0); + + // Algorithm starts here + svfloat32_t t0 = svmul_f32_m(pg, src, log2_e); // y = x * log2(e) + svfloat32_t t1 = svrintm_f32_m(inactive1, pg, t0); // rount to int (float) + svint32_t t2 = svcvt_s32_f32_m(inactive2, pg, t1); // n + + t1 
= svsub_f32_m(pg, t0, t1); // a = y - floor(y) + t1 = svadd_f32_m(pg, t1, one); // b = a + 1 + + svuint32_t t3 = svlsr_n_u32_m(pg, svreinterpret_u32_f32(t1), 17); // v = b >> 17 (u32) + svfloat32_t t4 = svexpa_f32(t3); // c = fexpa(v) + t4 = svscale_f32_m(pg, t4, t2); // fexpa(v) * 2^(n) + + // and_(t2.d, t1.d, not_mask17.d) + svfloat32_t t5 = svreinterpret_f32_u32(svand_u32_m(pg, svreinterpret_u32_f32(t1), not_mask17)); + t5 = svsub_f32_m(pg, t1, t5); // z + t0 = svmla_f32_m(pg, ln2, t5, half_ln2_sq); // ln2 + half_ln2_sq * z + t0 = svmla_f32_m(pg, one, t5, t0); // 1 + (ln2 * z) + (half_ln2_sq * z * z) + t0 = svmul_f32_m(pg, t0, t4); // Final result + + return t0; + } +#endif + #if defined(__ARM_NEON) && defined(__aarch64__) // adapted from arm limited optimized routine diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 64fb4ff4c..e1ce1d4cd 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -168,7 +168,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in #define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str) -#if !defined(GGML_USE_HIP) +#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM) static const char * cu_get_error_str(CUresult err) { const char * err_str; cuGetErrorString(err, &err_str); @@ -635,6 +635,7 @@ struct ggml_cuda_device_info { int nsm; // number of streaming multiprocessors size_t smpb; // max. shared memory per block size_t smpbo; // max. shared memory per block (with opt-in) + bool integrated; // Device is integrated as opposed to discrete bool vmm; // virtual memory support size_t vmm_granularity; // granularity of virtual memory size_t total_vram; diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index a4fbd8236..cfab2b5eb 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -623,8 +623,8 @@ static __global__ void flash_attn_combine_results( __builtin_assume(tid < D); extern __shared__ float2 meta[]; - if (tid < 2*parallel_blocks) { - ((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + tid]; + for (int i = tid; i < 2*parallel_blocks; i += D) { + ((float *) meta)[i] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + i]; } __syncthreads(); diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh index 7120053b6..925f39e89 100644 --- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -1246,7 +1246,7 @@ static __global__ void flash_attn_ext_f16( NO_DEVICE_CODE; return; } -#endif __CUDA_ARCH__ == GGML_CUDA_CC_TURING +#endif // __CUDA_ARCH__ == GGML_CUDA_CC_TURING static_assert(!mla || DKQ >= DV, "MLA needs DKQ >= DV"); diff --git a/ggml/src/ggml-cuda/fattn-vec-f16.cuh b/ggml/src/ggml-cuda/fattn-vec-f16.cuh index 798a59b27..35e649cb3 100644 --- a/ggml/src/ggml-cuda/fattn-vec-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-vec-f16.cuh @@ -212,6 +212,7 @@ static __global__ void flash_attn_vec_ext_f16( } } if (__all_sync(0xFFFFFFFF, skip)) { + __syncthreads(); continue; } #endif // GGML_USE_HIP diff --git a/ggml/src/ggml-cuda/fattn-vec-f32.cuh b/ggml/src/ggml-cuda/fattn-vec-f32.cuh index 49c592ea5..953967917 100644 --- a/ggml/src/ggml-cuda/fattn-vec-f32.cuh +++ b/ggml/src/ggml-cuda/fattn-vec-f32.cuh @@ -217,6 +217,7 @@ static __global__ void flash_attn_vec_ext_f32( } } if (__all_sync(0xFFFFFFFF, skip)) { + __syncthreads(); continue; } #endif // GGML_USE_HIP diff 
--git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 02dc8c12d..2a6f7f108 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -243,10 +243,10 @@ static ggml_cuda_device_info ggml_cuda_init() { info.default_tensor_split[id] = total_vram; total_vram += prop.totalGlobalMem; - - info.devices[id].nsm = prop.multiProcessorCount; - info.devices[id].smpb = prop.sharedMemPerBlock; - info.devices[id].warp_size = prop.warpSize; + info.devices[id].integrated = prop.integrated; + info.devices[id].nsm = prop.multiProcessorCount; + info.devices[id].smpb = prop.sharedMemPerBlock; + info.devices[id].warp_size = prop.warpSize; #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) info.devices[id].smpbo = prop.sharedMemPerBlock; @@ -1065,6 +1065,10 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_ GGML_UNUSED(buft); } +static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name; +} + static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { CUDA_CHECK(cudaFreeHost(buffer->context)); } @@ -2192,6 +2196,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_UNARY_OP_SILU: ggml_cuda_op_silu(ctx, dst); break; + case GGML_UNARY_OP_GELU_ERF: + ggml_cuda_op_gelu_erf(ctx, dst); + break; case GGML_UNARY_OP_GELU_QUICK: ggml_cuda_op_gelu_quick(ctx, dst); break; @@ -2638,6 +2645,8 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) { static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) { + // flag used to determine whether it is an integrated_gpu + const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated; while (!graph_evaluated_or_captured) { // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph. 
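// (editor's note — illustrative sketch, not part of the patch) The `integrated`
// flag read above marks devices that share physical memory with the host
// (cudaDeviceProp::integrated). On such devices, tensors kept in CUDA pinned-host
// buffers can be consumed directly, so the hunks below relax both the debug
// assertion in evaluate_and_capture_cuda_graph and
// ggml_backend_cuda_device_supports_buft. The acceptance rule reduces to roughly
// the predicate sketched here (the helper name is hypothetical; the
// ggml_backend_buft_* calls are the ones used in the diff):
static bool device_accepts_buft_sketch(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft, bool integrated) {
    // device-local and split buffers must belong to this device, as before
    if ((ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev) {
        return true;
    }
    // on integrated devices, pinned host buffers are additionally acceptable
    return integrated && ggml_backend_buft_is_cuda_host(buft);
}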
@@ -2656,7 +2665,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx if (node->src[j] != nullptr) { assert(node->src[j]->buffer); assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || - ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft)); + ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft) || (integrated && ggml_backend_buft_is_cuda_host(node->src[j]->buffer->buft))); } } #endif @@ -2977,6 +2986,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_UNARY_OP_SIGMOID: case GGML_UNARY_OP_HARDSIGMOID: case GGML_UNARY_OP_HARDSWISH: + case GGML_UNARY_OP_GELU_ERF: case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_TANH: case GGML_UNARY_OP_EXP: @@ -2990,9 +3000,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g { struct ggml_tensor * a = op->src[0]; struct ggml_tensor * b = op->src[1]; - // for small weight matrices the active device can end up without any rows, don't use row split in those cases - // this avoids some edge cases (and the performance would not be good anyways) if (a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) { + if (a->ne[2] > 1 || a->ne[3] > 1) { + return false; + } + // for small weight matrices the active device can end up without any rows, don't use row split in those cases + // this avoids some edge cases (and the performance would not be good anyways) ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) a->buffer->buft->context; int64_t row_low; int64_t row_high; @@ -3259,7 +3272,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g } static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - return (ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev; + ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context; + const bool integrated = ggml_cuda_info().devices[dev_ctx->device].integrated; + return (((ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev) || (integrated && ggml_backend_buft_is_cuda_host(buft))); } static int64_t get_op_batch_size(const ggml_tensor * op) { diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu index ec5773e01..2c0375fbe 100644 --- a/ggml/src/ggml-cuda/unary.cu +++ b/ggml/src/ggml-cuda/unary.cu @@ -23,6 +23,12 @@ static __device__ __forceinline__ float op_gelu(float x) { return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); } +static __device__ __forceinline__ float op_gelu_erf(float x) { + const float SQRT_2_INV = 0.70710678118654752440084436210484f; + + return 0.5f*x*(1.0f + erff(x*SQRT_2_INV)); +} + static __device__ __forceinline__ float op_gelu_quick(float x) { const float GELU_QUICK_COEF = -1.702f; @@ -134,6 +140,10 @@ void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { ggml_cuda_op_unary(ctx, dst); } +void ggml_cuda_op_gelu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + ggml_cuda_op_unary(ctx, dst); +} + void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { ggml_cuda_op_unary(ctx, dst); } diff --git a/ggml/src/ggml-cuda/unary.cuh b/ggml/src/ggml-cuda/unary.cuh index 940a1feed..6686fc17e 100644 --- a/ggml/src/ggml-cuda/unary.cuh +++ b/ggml/src/ggml-cuda/unary.cuh @@ -30,6 +30,8 @@ void 
ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_gelu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index a19cfb14e..6dc5ce0d9 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -32,6 +32,8 @@ extern "C" { #endif +void ggml_print_backtrace(void); + #ifndef MIN # define MIN(a, b) ((a) < (b) ? (a) : (b)) #endif @@ -386,7 +388,7 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size); return r; } -#elif defined(__riscv) && defined(GGML_RV_ZFH) +#elif defined(__riscv) && defined(__riscv_zfhmin) static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { float f; diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt index 352deb321..9f930c70b 100644 --- a/ggml/src/ggml-opencl/CMakeLists.txt +++ b/ggml/src/ggml-opencl/CMakeLists.txt @@ -55,14 +55,17 @@ endfunction() set(GGML_OPENCL_KERNELS add + argsort clamp cpy cvt diag_mask_inf + div gelu gemv_noshuffle_general gemv_noshuffle get_rows + group_norm im2col_f32 im2col_f16 mul_mat_Ab_Bi_8x4 @@ -83,11 +86,14 @@ set(GGML_OPENCL_KERNELS rms_norm rope scale + sigmoid silu softmax_4_f32 softmax_4_f16 softmax_f32 softmax_f16 + sub + sum_rows transpose ) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 586946048..5dbe97ab2 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #undef MIN #undef MAX @@ -74,6 +75,7 @@ struct ggml_cl_version { cl_uint minor = 0; }; + struct ggml_cl_compiler_version { ADRENO_CL_COMPILER_TYPE type; int major = -1; @@ -91,6 +93,14 @@ struct ggml_cl_compiler_version { } }; +static size_t align_to(size_t value, size_t to_alignment) { + GGML_ASSERT(to_alignment && "Invalid alignment (must be non-zero)"); + GGML_ASSERT((to_alignment & (to_alignment - 1)) == 0 && "to_alignment must be power-of-two"); + + return ((value + to_alignment - 1) / to_alignment) * to_alignment; +} + + // Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes. static ggml_cl_version parse_cl_version(std::string_view str) { size_t major_str_begin = 0; @@ -221,13 +231,25 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive return { type, major, minor, patch }; } +struct ggml_backend_opencl_context; + // backend device context struct ggml_backend_opencl_device_context { cl_platform_id platform; std::string platform_name; - cl_device_id device; - std::string device_name; + cl_device_id device; + std::string device_name; + cl_device_type device_type; + std::string device_version; + + // Initialized by ggml_cl2_init(). 
+ ggml_backend_opencl_context * backend_ctx = nullptr; + + // Initialized by ggml_backend_opencl_device_get_buffer_type() + ggml_backend_buffer_type buffer_type; + + cl_context context = nullptr; }; // backend context @@ -248,6 +270,8 @@ struct ggml_backend_opencl_context { int adreno_wave_size; + cl_bool non_uniform_workgroups; + cl_context context; cl_command_queue queue; @@ -275,27 +299,37 @@ struct ggml_backend_opencl_context { cl_program program_mul_mv_f16_f32; cl_program program_mul_mv_f32_f32; cl_program program_mul; + cl_program program_div; + cl_program program_sub; cl_program program_norm; cl_program program_relu; cl_program program_rms_norm; + cl_program program_group_norm; cl_program program_rope; cl_program program_scale; cl_program program_silu; + cl_program program_sigmoid; cl_program program_softmax_f32; cl_program program_softmax_f16; cl_program program_softmax_4_f32; cl_program program_softmax_4_f16; + cl_program program_argsort_f32_i32; + cl_program program_sum_rows_f32; cl_kernel kernel_add, kernel_add_row; cl_kernel kernel_mul, kernel_mul_row; + cl_kernel kernel_div, kernel_div_row; + cl_kernel kernel_sub, kernel_sub_row; cl_kernel kernel_scale; cl_kernel kernel_silu, kernel_silu_4; cl_kernel kernel_gelu, kernel_gelu_4; cl_kernel kernel_gelu_quick, kernel_gelu_quick_4; cl_kernel kernel_relu; + cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16; cl_kernel kernel_clamp; cl_kernel kernel_norm; cl_kernel kernel_rms_norm; + cl_kernel kernel_group_norm; cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8; cl_kernel kernel_soft_max, kernel_soft_max_4; cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16; @@ -315,6 +349,8 @@ struct ggml_backend_opencl_context { cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat; cl_kernel kernel_mul_mv_q6_K_f32; cl_kernel kernel_im2col_f32, kernel_im2col_f16; + cl_kernel kernel_argsort_f32_i32; + cl_kernel kernel_sum_rows_f32; #ifdef GGML_OPENCL_USE_ADRENO_KERNELS // Transpose kernels @@ -344,15 +380,8 @@ struct ggml_backend_opencl_context { #endif // GGML_OPENCL_USE_ADRENO_KERNELS }; -static ggml_backend_device g_ggml_backend_opencl_device; -static ggml_backend_opencl_device_context g_ggml_ctx_dev_main { - /*.platform =*/ nullptr, - /*.platform_nane =*/ "", - /*.device =*/ nullptr, - /*.device_name =*/ "", -}; - -static int ggml_backend_opencl_n_devices = 0; +// All registered devices with a default device in the front. 
+static std::vector g_ggml_backend_opencl_devices; // Profiling #ifdef GGML_OPENCL_PROFILING @@ -969,6 +998,105 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve GGML_LOG_CONT("."); } + // argsort + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "argsort.cl.h" + }; +#else + const std::string kernel_src = read_file("argsort.cl"); +#endif + backend_ctx->program_argsort_f32_i32 = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + + CL_CHECK((backend_ctx->kernel_argsort_f32_i32 = clCreateKernel(backend_ctx->program_argsort_f32_i32, "kernel_argsort_f32_i32", &err), err)); + GGML_LOG_CONT("."); + } + + // div + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "div.cl.h" + }; +#else + const std::string kernel_src = read_file("div.cl"); +#endif + backend_ctx->program_div = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + + CL_CHECK((backend_ctx->kernel_div = clCreateKernel(backend_ctx->program_div, "kernel_div", &err), err)); + CL_CHECK((backend_ctx->kernel_div_row = clCreateKernel(backend_ctx->program_div, "kernel_div_row", &err), err)); + GGML_LOG_CONT("."); + } + + // sub + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "sub.cl.h" + }; +#else + const std::string kernel_src = read_file("sub.cl"); +#endif + backend_ctx->program_sub = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + + CL_CHECK((backend_ctx->kernel_sub = clCreateKernel(backend_ctx->program_sub, "kernel_sub", &err), err)); + CL_CHECK((backend_ctx->kernel_sub_row = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row", &err), err)); + GGML_LOG_CONT("."); + } + + // sum_rows + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "sum_rows.cl.h" + }; +#else + const std::string kernel_src = read_file("sum_rows.cl"); +#endif + backend_ctx->program_sum_rows_f32 = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + + CL_CHECK((backend_ctx->kernel_sum_rows_f32 = clCreateKernel(backend_ctx->program_sum_rows_f32, "kernel_sum_rows_f32", &err), err)); + GGML_LOG_CONT("."); + } + + // sigmoid + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "sigmoid.cl.h" + }; +#else + const std::string kernel_src = read_file("sigmoid.cl"); +#endif + backend_ctx->program_sigmoid = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + + CL_CHECK((backend_ctx->kernel_sigmoid_f32 = clCreateKernel(backend_ctx->program_sigmoid, "kernel_sigmoid_f32", &err), err)); + CL_CHECK((backend_ctx->kernel_sigmoid_f16 = clCreateKernel(backend_ctx->program_sigmoid, "kernel_sigmoid_f16", &err), err)); + GGML_LOG_CONT("."); + } + + // group_norm + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "group_norm.cl.h" + }; +#else + const std::string kernel_src = read_file("group_norm.cl"); +#endif + backend_ctx->program_group_norm = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + + CL_CHECK((backend_ctx->kernel_group_norm = clCreateKernel(backend_ctx->program_group_norm, "kernel_group_norm", &err), err)); + GGML_LOG_CONT("."); + } + // Adreno kernels #ifdef GGML_OPENCL_USE_ADRENO_KERNELS // transpose @@ -1107,25 +1235,19 @@ 
static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve GGML_LOG_CONT("\n"); } -static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { - static bool initialized = false; - static ggml_backend_opencl_context *backend_ctx = nullptr; +// XXX static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { +// XXX static bool initialized = false; +// XXX static ggml_backend_opencl_context *backend_ctx = nullptr; - if (initialized) { - return backend_ctx; - } +static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev); - ggml_backend_opencl_device_context *dev_ctx = (ggml_backend_opencl_device_context *)dev->context; - GGML_ASSERT(dev_ctx); - GGML_ASSERT(dev_ctx->platform == nullptr); - GGML_ASSERT(dev_ctx->device == nullptr); - GGML_ASSERT(backend_ctx == nullptr); +namespace /* anonymous */ { +extern struct ggml_backend_device_i ggml_backend_opencl_device_i; +} - initialized = true; - backend_ctx = new ggml_backend_opencl_context(); - backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN; - - cl_int err; +// Look for available and suitable devices. +static std::vector ggml_opencl_probe_devices(ggml_backend_reg * reg) { + std::vector found_devices; #ifdef GGML_OPENCL_PROFILING GGML_LOG_INFO("ggml_opencl: OpenCL profiling enabled\n"); @@ -1158,11 +1280,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { struct cl_device devices[NDEV]; unsigned n_devices = 0; struct cl_device * default_device = NULL; + unsigned default_platform_number = 0; cl_platform_id platform_ids[NPLAT]; if (clGetPlatformIDs(NPLAT, platform_ids, &n_platforms) != CL_SUCCESS) { GGML_LOG_ERROR("ggml_opencl: plaform IDs not available.\n"); - return backend_ctx; + return found_devices; } for (unsigned i = 0; i < n_platforms; i++) { @@ -1197,19 +1320,22 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { } if (default_device == NULL && p->default_device != NULL) { - default_device = p->default_device; + default_device = p->default_device; + default_platform_number = i; } } if (n_devices == 0) { GGML_LOG_ERROR("ggml_opencl: could find any OpenCL devices.\n"); - return backend_ctx; + return found_devices; } - char * user_platform_string = getenv("GGML_OPENCL_PLATFORM"); - char * user_device_string = getenv("GGML_OPENCL_DEVICE"); - int user_platform_number = -1; - int user_device_number = -1; + char * user_platform_string = getenv("GGML_OPENCL_PLATFORM"); + char * user_device_string = getenv("GGML_OPENCL_DEVICE"); + int user_platform_number = -1; + int user_device_number = -1; + cl_device * candidate_devices = nullptr; + unsigned n_candidate_devices = 0; unsigned n; if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) { @@ -1224,12 +1350,11 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { GGML_LOG_ERROR("ggml_opencl: invalid device number %d\n", user_device_number); exit(1); } - default_device = &platform->devices[user_device_number]; + default_device = &platform->devices[user_device_number]; + candidate_devices = platform->devices; + n_candidate_devices = platform->n_devices; } else { - - struct cl_device * selected_devices = devices; - unsigned n_selected_devices = n_devices; - + // Choose a platform by matching a substring. 
if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) { for (unsigned i = 0; i < n_platforms; i++) { struct cl_platform * p = &platforms[i]; @@ -1244,20 +1369,20 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { exit(1); } } - if (user_platform_number != -1) { - struct cl_platform * p = &platforms[user_platform_number]; - selected_devices = p->devices; - n_selected_devices = p->n_devices; - default_device = p->default_device; - if (n_selected_devices == 0) { - GGML_LOG_ERROR("ggml_opencl: selected platform '%s' does not have any devices.\n", p->name); - exit(1); - } + + int platform_idx = user_platform_number != -1 ? user_platform_number : default_platform_number; + struct cl_platform * p = &platforms[platform_idx]; + candidate_devices = p->devices; + n_candidate_devices = p->n_devices; + default_device = p->default_device; + if (n_candidate_devices == 0) { + GGML_LOG_ERROR("ggml_opencl: selected platform '%s' does not have any devices.\n", p->name); + exit(1); } if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) { - for (unsigned i = 0; i < n_selected_devices; i++) { - struct cl_device * d = &selected_devices[i]; + for (unsigned i = 0; i < n_candidate_devices; i++) { + struct cl_device * d = &candidate_devices[i]; if (strstr(d->name, user_device_string) != NULL) { user_device_number = d->number; break; @@ -1269,71 +1394,145 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { } } if (user_device_number != -1) { - selected_devices = &devices[user_device_number]; - n_selected_devices = 1; - default_device = &selected_devices[0]; + candidate_devices = &devices[user_device_number]; + n_candidate_devices = 1; + default_device = &candidate_devices[0]; } - GGML_ASSERT(n_selected_devices > 0); + GGML_ASSERT(n_candidate_devices > 0); if (default_device == NULL) { - default_device = &selected_devices[0]; + default_device = &candidate_devices[0]; } } - GGML_LOG_INFO("ggml_opencl: selecting platform: '%s'\n", default_device->platform->name); - GGML_LOG_INFO("ggml_opencl: selecting device: '%s (%s)'\n", default_device->name, default_device->version); - if (default_device->type != CL_DEVICE_TYPE_GPU) { - GGML_LOG_WARN("ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name); + GGML_ASSERT(n_candidate_devices != 0 && candidate_devices); + + // Put the default device in front. 
+ for (unsigned i = 1; i < n_candidate_devices; i++) { + if (&candidate_devices[i] == default_device) { + std::swap(candidate_devices[0], candidate_devices[i]); + default_device = &candidate_devices[0]; + break; + } } - dev_ctx->platform = default_device->platform->id; - dev_ctx->device = default_device->id; - backend_ctx->device = default_device->id; + GGML_LOG_INFO("ggml_opencl: selected platform: '%s'\n", default_device->platform->name); - if (strstr(default_device->name, "Adreno") || - strstr(default_device->name, "Qualcomm") || - strstr(default_device->version, "Adreno")) { + std::vector device_ids; + for (auto dev = candidate_devices, dev_end = candidate_devices + n_candidate_devices; dev != dev_end; dev++) { + device_ids.push_back(dev->id); + } + + cl_int err; + cl_context shared_context; + cl_context_properties properties[] = { (intptr_t) CL_CONTEXT_PLATFORM, (intptr_t) default_device->platform->id, 0 }; + + CL_CHECK( + (shared_context = clCreateContext(properties, device_ids.size(), device_ids.data(), NULL, NULL, &err), err)); + + for (auto dev = candidate_devices, dev_end = candidate_devices + n_candidate_devices; dev != dev_end; dev++) { + GGML_LOG_INFO("\nggml_opencl: device: '%s (%s)'\n", dev->name, dev->version); + + auto dev_ctx = std::unique_ptr(new ggml_backend_opencl_device_context{ + /*.platform =*/dev->platform->id, + /*.platform_nane =*/dev->platform->name, + /*.device =*/dev->id, + /*.device_name =*/dev->name, + /*.device_type =*/dev->type, + /*.device_version =*/dev->version, + /*.backend_ctx =*/nullptr, + /*.buffer_type =*/{}, + /*.context =*/shared_context, + }); + + found_devices.push_back(ggml_backend_device{ + /* .iface = */ ggml_backend_opencl_device_i, + /* .reg = */ reg, + /* .context = */ dev_ctx.get(), + }); + + if (!ggml_cl2_init(&found_devices.back())) { + found_devices.pop_back(); + GGML_LOG_INFO("ggml_opencl: drop unsupported device.\n"); + continue; + } + + dev_ctx.release(); + } + + if (found_devices.size()) { + auto * dev_ctx = static_cast(found_devices.front().context); + GGML_LOG_INFO("ggml_opencl: default device: '%s (%s)'\n", dev_ctx->device_name.c_str(), + dev_ctx->device_version.c_str()); + + if (dev_ctx->device_type != CL_DEVICE_TYPE_GPU) { + GGML_LOG_WARN("ggml_opencl: warning, the default device is not a GPU: '%s'.\n", + dev_ctx->device_name.c_str()); + } + } + + return found_devices; +} + +// Initialize device if it is supported (returns nullptr if it is not). 
+static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { + GGML_ASSERT(dev); + GGML_ASSERT(dev->context); + + ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) dev->context; + GGML_ASSERT(dev_ctx->platform); + GGML_ASSERT(dev_ctx->device); + + if (dev_ctx->backend_ctx) { + return dev_ctx->backend_ctx; + } + + auto backend_ctx = std::make_unique(); + backend_ctx->device = dev_ctx->device; + backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN; + + if (strstr(dev_ctx->device_name.c_str(), "Adreno") || + strstr(dev_ctx->device_name.c_str(), "Qualcomm") || + strstr(dev_ctx->device_version.c_str(), "Adreno")) { backend_ctx->gpu_family = GPU_FAMILY::ADRENO; // Usually device version contains the detailed device name - backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->version); + backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_version.c_str()); if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::ADRENO_UNKNOWN) { - backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name); + backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_name.c_str()); } // Use wave size of 64 for all Adreno GPUs. backend_ctx->adreno_wave_size = 64; - } else if (strstr(default_device->name, "Intel")) { + } else if (strstr(dev_ctx->device_name.c_str(), "Intel")) { backend_ctx->gpu_family = GPU_FAMILY::INTEL; } else { - GGML_LOG_ERROR("Unsupported GPU: %s\n", default_device->name); + GGML_LOG_ERROR("Unsupported GPU: %s\n", dev_ctx->device_name.c_str()); backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN; - return backend_ctx; + return nullptr; } #ifdef GGML_OPENCL_USE_ADRENO_KERNELS if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) { GGML_LOG_ERROR("ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; " "run on an Adreno GPU or recompile with CMake option `-DGGML_OPENCL_USE_ADRENO_KERNELS=OFF`\n"); - return backend_ctx; + return nullptr; } #endif // Populate backend device name - dev_ctx->platform_name = default_device->platform->name; - dev_ctx->device_name = default_device->name; - backend_ctx->device_name = default_device->name; + backend_ctx->device_name = dev_ctx->device_name; // A local ref of cl_device_id for convenience cl_device_id device = backend_ctx->device; - ggml_cl_version platform_version = get_opencl_platform_version(default_device->platform->id); + ggml_cl_version platform_version = get_opencl_platform_version(dev_ctx->platform); // Check device OpenCL version, OpenCL 2.0 or above is required ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device); if (opencl_c_version.major < 2) { GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n"); - return backend_ctx; + return nullptr; } // Check driver version @@ -1364,7 +1563,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { // fp16 is required if (!backend_ctx->fp16_support) { GGML_LOG_ERROR("ggml_opencl: device does not support FP16\n"); - return backend_ctx; + return nullptr; } // If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes @@ -1373,7 +1572,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { strstr(ext_buffer, "cl_intel_subgroups") == NULL) { GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) " "(note that subgroups is an optional feature in OpenCL 3.0)\n"); - return backend_ctx; + return nullptr; } cl_uint base_align_in_bits; @@ -1397,6 +1596,15 @@ static 
ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n", svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false"); + if (opencl_c_version.major >= 3) { + CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool), + &backend_ctx->non_uniform_workgroups, 0)); + } else { + GGML_ASSERT(opencl_c_version.major == 2); + // Non-uniform workgroup sizes is mandatory feature in v2.x. + backend_ctx->non_uniform_workgroups = true; + } + // Print out configurations #ifdef GGML_OPENCL_SOA_Q GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n"); @@ -1406,14 +1614,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n"); #endif // GGML_OPENCL_USE_ADRENO_KERNELS - cl_context_properties properties[] = { - (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)dev_ctx->platform, 0 - }; - - CL_CHECK((backend_ctx->context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err)); + cl_int err; // A local ref of cl_context for convenience - cl_context context = backend_ctx->context; + cl_context context = backend_ctx->context = dev_ctx->context; //CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err), // (err != CL_INVALID_QUEUE_PROPERTIES && err != CL_INVALID_VALUE ? err : @@ -1426,7 +1630,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { CL_CHECK((backend_ctx->queue = clCreateCommandQueue(context, device, command_queue_props, &err), err)); // Load kernels - load_cl_kernels(backend_ctx, opencl_c_version); + load_cl_kernels(backend_ctx.get(), opencl_c_version); #ifdef GGML_OPENCL_USE_ADRENO_KERNELS // Allocate intermediate buffers and images @@ -1456,10 +1660,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { CL_CHECK((backend_ctx->B_d_max = clCreateBuffer(context, 0, max_B_d_bytes, NULL, &err), err)); #endif // GGML_OPENCL_USE_ADRENO_KERNELS - // For now we support a single devices - ggml_backend_opencl_n_devices = 1; - - return backend_ctx; + dev_ctx->backend_ctx = backend_ctx.release(); + return dev_ctx->backend_ctx; } static void ggml_cl2_free(void) { @@ -1664,10 +1866,46 @@ static void ggml_backend_opencl_synchronize(ggml_backend_t backend) { GGML_UNUSED(backend); } +// Syncronizes the 'backend_ctx's device with others so that commands +// enqueued to it won't start until commands in the other devices have +// completed. +static void sync_with_other_backends(ggml_backend_opencl_context * backend_ctx) { + if (g_ggml_backend_opencl_devices.size() < 2) + return; // No other devices to synchronize with. 
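    // (editor's note) The block below implements a cross-queue ordering point:
    // a marker event is enqueued and flushed on every *other* registered
    // device's command queue, and a barrier waiting on all of those events is
    // then enqueued on this backend's queue. Work submitted to this queue
    // afterwards therefore cannot start until previously submitted work on the
    // other devices has completed.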
+ + std::vector events; + events.reserve(g_ggml_backend_opencl_devices.size()); + + for (ggml_backend_device & backend_dev : g_ggml_backend_opencl_devices) { + auto * other_backend_ctx = ggml_cl2_init(&backend_dev); + if (backend_ctx != other_backend_ctx) { + cl_event ev; + CL_CHECK(clEnqueueMarkerWithWaitList(other_backend_ctx->queue, 0, nullptr, &ev)); + CL_CHECK(clFlush(other_backend_ctx->queue)); + events.push_back(ev); + } + } + + CL_CHECK(clEnqueueBarrierWithWaitList(backend_ctx->queue, events.size(), events.data(), nullptr)); + for (auto ev : events) { + CL_CHECK(clReleaseEvent(ev)); + } +} + +static void sync_with_other_backends(ggml_backend_t backend) { + auto * backend_ctx = static_cast(backend->context); + sync_with_other_backends(backend_ctx); +} + static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; + // NOTE: this may oversynchronize by synchronizing with + // backends/devices which don't compute 'cgraph's + // dependencies. + sync_with_other_backends(backend); + if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; } @@ -1729,6 +1967,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te case GGML_OP_ADD: case GGML_OP_SCALE: case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_SUB: return op->src[0]->type == GGML_TYPE_F32; case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { @@ -1736,7 +1976,9 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_RELU: case GGML_UNARY_OP_GELU_QUICK: - return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; + return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; + case GGML_UNARY_OP_SIGMOID: + return ggml_is_contiguous(op->src[0]); default: return false; } @@ -1746,11 +1988,13 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te case GGML_OP_NORM: case GGML_OP_RMS_NORM: return true; + case GGML_OP_GROUP_NORM: + return ggml_is_contiguous(op->src[0]); case GGML_OP_MUL_MAT: if (op->src[0]->type == GGML_TYPE_F16) { return true; } else if (op->src[0]->type == GGML_TYPE_F32) { - return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]); + return op->src[1]->type == GGML_TYPE_F32; } else if (op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_Q6_K) { return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]); @@ -1785,6 +2029,10 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te } case GGML_OP_IM2COL: return true; + case GGML_OP_ARGSORT: + return op->src[0]->type == GGML_TYPE_F32; + case GGML_OP_SUM_ROWS: + return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]); default: return false; } @@ -2058,15 +2306,16 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, // The original tensor memory is divided into scales and quants, i.e., // we first store scales, then quants. // Create subbuffer for scales. 
- region.origin = extra_orig->offset + tensor->view_offs + offset; + region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment); region.size = size_d; extra->d = clCreateSubBuffer( extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); CL_CHECK(err); + auto previous_origin = region.origin; // Create subbuffer for quants. - region.origin = extra_orig->offset + tensor->view_offs + offset + size_d; + region.origin = align_to(previous_origin + size_d, backend_ctx->alignment); region.size = size_q; extra->q = clCreateSubBuffer( extra_orig->data_device, CL_MEM_READ_WRITE, @@ -2271,8 +2520,8 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, cl_context context = backend_ctx->context; cl_command_queue queue = backend_ctx->queue; - // Make sure all previously submitted commands are finished. - CL_CHECK(clFinish(queue)); + // Make sure all previously submitted commands in other devices are finished. + sync_with_other_backends(backend_ctx); #ifdef GGML_OPENCL_SOA_Q // In end-to-end runs, get_tensor is usually used to get back the logits, @@ -2376,13 +2625,8 @@ static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_b } static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) { - // FIXME: not thread safe, device may not be initialized yet - static cl_uint alignment = -1; - if (alignment == (cl_uint)-1) { - ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device); - alignment = backend_ctx->alignment; - } - return alignment; + ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device); + return backend_ctx->alignment; } static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) { @@ -2409,16 +2653,6 @@ static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = { /* .is_host = */ NULL, }; -ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() { - static ggml_backend_buffer_type buffer_type = { - /* .iface = */ ggml_backend_opencl_buffer_type_interface, - /* .device = */ &g_ggml_backend_opencl_device, - /* .context = */ nullptr, - }; - - return &buffer_type; -} - // // backend device // @@ -2476,9 +2710,15 @@ static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, co } static ggml_backend_buffer_type_t ggml_backend_opencl_device_get_buffer_type(ggml_backend_dev_t dev) { - return ggml_backend_opencl_buffer_type(); + auto * dev_ctx = static_cast(dev->context); - GGML_UNUSED(dev); + dev_ctx->buffer_type = ggml_backend_buffer_type{ + /* .iface = */ ggml_backend_opencl_buffer_type_interface, + /* .device = */ dev, + /* .context = */ nullptr, + }; + + return &dev_ctx->buffer_type; } static ggml_backend_buffer_t ggml_backend_opencl_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { @@ -2494,12 +2734,21 @@ static bool ggml_backend_opencl_device_supports_op(ggml_backend_dev_t dev, const } static bool ggml_backend_opencl_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - return buft->iface.get_name == ggml_backend_opencl_buffer_type_get_name; + // Check 'dev' and 'buffer_type' are not objects belonging to this backend. + if (dev->iface.get_name != ggml_backend_opencl_device_get_name || + buft->iface.get_name != ggml_backend_opencl_buffer_type_get_name) { + return false; + } - GGML_UNUSED(dev); + // Check cl_context is the same. 
clEnqueue* commands may not use + // buffers from another cl_context. + ggml_backend_opencl_context * backend_ctx0 = ggml_cl2_init(dev); + ggml_backend_opencl_context * backend_ctx1 = ggml_cl2_init(buft->device); + return backend_ctx0->context == backend_ctx1->context; } -static struct ggml_backend_device_i ggml_backend_opencl_device_i = { +namespace /* anonymous */ { +struct ggml_backend_device_i ggml_backend_opencl_device_i = { /* .get_name = */ ggml_backend_opencl_device_get_name, /* .get_description = */ ggml_backend_opencl_device_get_description, /* .get_memory = */ ggml_backend_opencl_device_get_memory, @@ -2516,6 +2765,7 @@ static struct ggml_backend_device_i ggml_backend_opencl_device_i = { /* .event_free = */ NULL, /* .event_synchronize = */ NULL, }; +} // Backend registry @@ -2526,15 +2776,15 @@ static const char * ggml_backend_opencl_reg_get_name(ggml_backend_reg_t reg) { } static size_t ggml_backend_opencl_reg_device_count(ggml_backend_reg_t reg) { - return ggml_backend_opencl_n_devices; + return g_ggml_backend_opencl_devices.size(); GGML_UNUSED(reg); } static ggml_backend_dev_t ggml_backend_opencl_reg_device_get(ggml_backend_reg_t reg, size_t index) { - GGML_ASSERT(index == 0); + GGML_ASSERT(index < ggml_backend_opencl_reg_device_count(reg)); - return &g_ggml_backend_opencl_device; + return &g_ggml_backend_opencl_devices[index]; GGML_UNUSED(reg); GGML_UNUSED(index); @@ -2548,27 +2798,23 @@ static struct ggml_backend_reg_i ggml_backend_opencl_reg_i = { }; ggml_backend_reg_t ggml_backend_opencl_reg(void) { - // TODO: make this thread-safe somehow? + static std::mutex mutex; static ggml_backend_reg reg; static bool initialized = false; + std::lock_guard lock(mutex); - if (!initialized) { - reg = ggml_backend_reg { - /* .api_version = */ GGML_BACKEND_API_VERSION, - /* .iface = */ ggml_backend_opencl_reg_i, - /* .context = */ NULL, - }; - - g_ggml_backend_opencl_device = ggml_backend_device { - /* .iface = */ ggml_backend_opencl_device_i, - /* .reg = */ ®, - /* .context = */ &g_ggml_ctx_dev_main, - }; - - ggml_cl2_init(&g_ggml_backend_opencl_device); - - initialized = true; + if (initialized) { + return ® } + initialized = true; + + g_ggml_backend_opencl_devices = ggml_opencl_probe_devices(®); + + reg = ggml_backend_reg{ + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_opencl_reg_i, + /* .context = */ NULL, + }; return ® } @@ -2942,14 +3188,19 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; + size_t * local_work_size_ptr = local_work_size; + if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) { + local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. 
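        // (editor's note) Passing a NULL local work size lets the OpenCL
        // runtime pick work-group sizes that evenly divide the global size.
        // This is required on devices without
        // CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT whenever n is not a
        // multiple of the preferred work-group size of 64.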
+ } + #ifdef GGML_OPENCL_PROFILING cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); + populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); #else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); #endif } else { unsigned int nth = MIN(64, ne0); @@ -3072,6 +3323,261 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3)); } + if (bcast_row) { + int n = ggml_nelements(dst)/4; + size_t global_work_size[] = {(size_t)n, 1, 1}; + size_t local_work_size[] = {64, 1, 1}; + + size_t * local_work_size_ptr = local_work_size; + if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) { + local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. + } + +#ifdef GGML_OPENCL_PROFILING + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); + + g_profiling_info.emplace_back(); + populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); +#else + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); +#endif + } else { + unsigned int nth = MIN(64, ne0); + size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03}; + size_t local_work_size[] = {nth, 1, 1}; + +#ifdef GGML_OPENCL_PROFILING + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + + g_profiling_info.emplace_back(); + populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); +#else + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); +#endif + } +} + +static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(src1); + GGML_ASSERT(src1->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + const int ne03 = src0->ne[3]; + + const cl_ulong nb00 = src0->nb[0]; + const cl_ulong nb01 = src0->nb[1]; + const cl_ulong nb02 = src0->nb[2]; + const cl_ulong nb03 = src0->nb[3]; + + const int ne10 = src1->ne[0]; + const int ne11 = src1->ne[1]; + const int ne12 = src1->ne[2]; + const int ne13 = src1->ne[3]; + + const cl_ulong nb10 = src1->nb[0]; + const cl_ulong nb11 = src1->nb[1]; + const cl_ulong nb12 = src1->nb[2]; + const cl_ulong nb13 = src1->nb[3]; + + const int ne0 = dst->ne[0]; + + const cl_ulong nb0 = dst->nb[0]; + const cl_ulong nb1 = dst->nb[1]; + const cl_ulong nb2 = dst->nb[2]; + const cl_ulong nb3 = dst->nb[3]; + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + cl_command_queue queue = backend_ctx->queue; + + ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; + 
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; + ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; + + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offset1 = extra1->offset + src1->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; + + bool bcast_row = false; + cl_kernel kernel; + + if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) { + GGML_ASSERT(ggml_is_contiguous(src0)); + + // src1 is a row + GGML_ASSERT(ne11 == 1); + + bcast_row = true; + int ne = ne00 / 4; + kernel = backend_ctx->kernel_div_row; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne)); + } else { + kernel = backend_ctx->kernel_div; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11)); + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13)); + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0)); + CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0)); + CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1)); + CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2)); + CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3)); + } + + if (bcast_row) { + int n = ggml_nelements(dst)/4; + size_t global_work_size[] = {(size_t)n, 1, 1}; + size_t local_work_size[] = {64, 1, 1}; + +#ifdef GGML_OPENCL_PROFILING + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + + g_profiling_info.emplace_back(); + populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); +#else + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); +#endif + } else { + unsigned int nth = MIN(64, ne0); + size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03}; + size_t local_work_size[] = {nth, 1, 1}; + +#ifdef GGML_OPENCL_PROFILING + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, 
&evt)); + + g_profiling_info.emplace_back(); + populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); +#else + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); +#endif + } +} + +static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(src1); + GGML_ASSERT(src1->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + const int ne03 = src0->ne[3]; + + const cl_ulong nb00 = src0->nb[0]; + const cl_ulong nb01 = src0->nb[1]; + const cl_ulong nb02 = src0->nb[2]; + const cl_ulong nb03 = src0->nb[3]; + + const int ne10 = src1->ne[0]; + const int ne11 = src1->ne[1]; + const int ne12 = src1->ne[2]; + const int ne13 = src1->ne[3]; + + const cl_ulong nb10 = src1->nb[0]; + const cl_ulong nb11 = src1->nb[1]; + const cl_ulong nb12 = src1->nb[2]; + const cl_ulong nb13 = src1->nb[3]; + + const int ne0 = dst->ne[0]; + + const cl_ulong nb0 = dst->nb[0]; + const cl_ulong nb1 = dst->nb[1]; + const cl_ulong nb2 = dst->nb[2]; + const cl_ulong nb3 = dst->nb[3]; + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + cl_command_queue queue = backend_ctx->queue; + + ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; + ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; + ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; + + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offset1 = extra1->offset + src1->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; + + bool bcast_row = false; + cl_kernel kernel; + + if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) { + GGML_ASSERT(ggml_is_contiguous(src0)); + + // src1 is a row + GGML_ASSERT(ne11 == 1); + + bcast_row = true; + int ne = ne00 / 4; + kernel = backend_ctx->kernel_sub_row; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne)); + } else { + kernel = backend_ctx->kernel_sub; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12)); + 
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11)); + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13)); + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0)); + CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0)); + CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1)); + CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2)); + CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3)); + } + if (bcast_row) { int n = ggml_nelements(dst)/4; size_t global_work_size[] = {(size_t)n, 1, 1}; @@ -3233,14 +3739,19 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; + size_t * local_work_size_ptr = local_work_size; + if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) { + local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. + } + #ifdef GGML_OPENCL_PROFILING cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); + populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); #else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); #endif } @@ -3273,14 +3784,71 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; + size_t * local_work_size_ptr = local_work_size; + if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) { + local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. 
+ } + #ifdef GGML_OPENCL_PROFILING cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); + populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); #else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); +#endif +} + +static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + + UNUSED(src1); + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + cl_command_queue queue = backend_ctx->queue; + + ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; + ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; + + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; + + cl_kernel kernel; + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + kernel = backend_ctx->kernel_sigmoid_f32; + } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { + kernel = backend_ctx->kernel_sigmoid_f16; + } else { + GGML_ASSERT(false && "Unsupported data types for sigmoid (input and output must be both f32 or f16)"); + } + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); + + const int64_t n = ggml_nelements(dst); + + size_t global_work_size[] = {(size_t)n, 1, 1}; + size_t local_work_size[] = {64, 1, 1}; + + size_t * local_work_size_ptr = local_work_size; + if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) { + local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. + } + +#ifdef GGML_OPENCL_PROFILING + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); + + g_profiling_info.emplace_back(); + populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); +#else + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); #endif } @@ -3320,14 +3888,19 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; + size_t * local_work_size_ptr = local_work_size; + if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) { + local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. 
+ } + #ifdef GGML_OPENCL_PROFILING cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); + populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); #else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); #endif } @@ -3476,6 +4049,65 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c #endif } +static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + + UNUSED(src1); + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + cl_command_queue queue = backend_ctx->queue; + + ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; + ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; + + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; + + int32_t n_groups = ((const int32_t *) dst->op_params)[0]; + int32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + n_groups - 1) / n_groups); + float eps = ((const float *) dst->op_params)[1]; + + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + const int ne = ne00*ne01*ne02; + + cl_kernel kernel = backend_ctx->kernel_group_norm; + + size_t sgs = 64; + if (backend_ctx->gpu_family == ADRENO) { + sgs = 64; + } else if (backend_ctx->gpu_family == INTEL) { + sgs = 32; + } else { + GGML_ASSERT(false && "Unsupported GPU"); + } + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &group_size)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps)); + + size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1}; + size_t local_work_size[] = {(size_t)sgs, 1, 1}; + +#ifdef GGML_OPENCL_PROFILING + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + + g_profiling_info.emplace_back(); + populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); +#else + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); +#endif +} + static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); @@ -4230,14 +4862,19 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; + size_t * local_work_size_ptr = local_work_size; + if (n % 64 != 0 && 
!backend_ctx->non_uniform_workgroups) { + local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. + } + #ifdef GGML_OPENCL_PROFILING cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); + populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); #else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); #endif } @@ -4418,14 +5055,19 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr size_t global_work_size[] = {(size_t)ne00, (size_t)ne01, (size_t)ne02}; size_t local_work_size[] = {64, 1, 1}; + size_t * local_work_size_ptr = local_work_size; + if (ne00 % 64 != 0 && !backend_ctx->non_uniform_workgroups) { + local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. + } + #ifdef GGML_OPENCL_PROFILING cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); + populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); #else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); #endif } } @@ -4815,6 +5457,124 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con #endif } +static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + GGML_UNUSED(src1); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_is_contiguous(src0)); + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + cl_command_queue queue = backend_ctx->queue; + + ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; + ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; + + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; + + const int ne00 = src0->ne[0]; + const int nrows = ggml_nrows(src0); + + int ne00_padded = 1; + while (ne00_padded < ne00) { + ne00_padded *= 2; + } + + int order = (enum ggml_sort_order) dst->op_params[0]; + + cl_kernel kernel = backend_ctx->kernel_argsort_f32_i32; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00)); + 
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne00_padded)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &order)); + CL_CHECK(clSetKernelArg(kernel, 7, ne00_padded*sizeof(int), NULL)); + + size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1}; + size_t local_work_size[] = {(size_t)ne00_padded, 1, 1}; + +#ifdef GGML_OPENCL_PROFILING + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + + g_profiling_info.emplace_back(); + populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); +#else + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); +#endif +} + +static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + GGML_UNUSED(src1); + + GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); + GGML_ASSERT(ggml_is_contiguous(src0)); + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + cl_command_queue queue = backend_ctx->queue; + + ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; + ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; + + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; + + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + const int ne03 = src0->ne[3]; + + const cl_ulong nb01 = src0->nb[1]; + const cl_ulong nb02 = src0->nb[2]; + const cl_ulong nb03 = src0->nb[3]; + + const cl_ulong nb1 = dst->nb[1]; + const cl_ulong nb2 = dst->nb[2]; + const cl_ulong nb3 = dst->nb[3]; + + cl_kernel kernel = backend_ctx->kernel_sum_rows_f32; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb1)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb2)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb3)); + + size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03}; + size_t local_work_size[] = {(size_t)64, 1, 1}; + +#ifdef GGML_OPENCL_PROFILING + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + + g_profiling_info.emplace_back(); + populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); +#else + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); +#endif +} + //------------------------------------------------------------------------------ // Op offloading //------------------------------------------------------------------------------ 
@@ -4863,6 +5623,18 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor } func = ggml_cl_mul; break; + case GGML_OP_DIV: + if (!any_on_device) { + return false; + } + func = ggml_cl_div; + break; + case GGML_OP_SUB: + if (!any_on_device) { + return false; + } + func = ggml_cl_sub; + break; case GGML_OP_UNARY: switch (ggml_get_unary_op(tensor)) { case GGML_UNARY_OP_GELU: @@ -4889,6 +5661,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor } func = ggml_cl_relu; break; + case GGML_UNARY_OP_SIGMOID: + if (!any_on_device) { + return false; + } + func = ggml_cl_sigmoid; + break; default: return false; } break; @@ -4910,6 +5688,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor } func = ggml_cl_rms_norm; break; + case GGML_OP_GROUP_NORM: + if (!any_on_device) { + return false; + } + func = ggml_cl_group_norm; + break; case GGML_OP_MUL_MAT: if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) { return false; @@ -4955,6 +5739,18 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor } func = ggml_cl_im2col; break; + case GGML_OP_ARGSORT: + if (!any_on_device) { + return false; + } + func = ggml_cl_argsort; + break; + case GGML_OP_SUM_ROWS: + if (!any_on_device) { + return false; + } + func = ggml_cl_sum_rows; + break; default: return false; } diff --git a/ggml/src/ggml-opencl/kernels/argsort.cl b/ggml/src/ggml-opencl/kernels/argsort.cl new file mode 100644 index 000000000..af4adc7b8 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/argsort.cl @@ -0,0 +1,86 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +#ifdef cl_intel_subgroups +#pragma OPENCL EXTENSION cl_intel_subgroups : enable +#else +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#endif + +#ifdef cl_intel_required_subgroup_size +#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable +#define INTEL_GPU 1 +#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16))) +#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32))) +#elif defined(cl_qcom_reqd_sub_group_size) +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define ADRENO_GPU 1 +#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half"))) +#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full"))) +#endif + +#define SWAP(x, y, T) { T tmp = (x); (x) = (y); (y) = tmp; } + +enum ggml_sort_order { + GGML_SORT_ORDER_ASC, + GGML_SORT_ORDER_DESC, +}; + +kernel void kernel_argsort_f32_i32( + global float * src0, + ulong offset0, + global int * dst, + ulong offsetd, + const int ne00, + const int ne00_pad, + const int order, + local int * dst_row +) { + // bitonic sort + int col = get_local_id(0); + int row = get_group_id(1); + + if (col >= ne00_pad) { + return; + } + + src0 = (global char *)((global char *)src0 + offset0); + dst = (global float *)((global char *)dst + offsetd); + + global float * x_row = src0 + row * ne00; + + // initialize indices + dst_row[col] = col; + + barrier(CLK_LOCAL_MEM_FENCE); + + for (int k = 2; k <= ne00_pad; k *= 2) { + for (int j = k / 2; j > 0; j /= 2) { + int ixj = col ^ j; + if (ixj > col) { + if ((col & k) == 0) { + if (dst_row[col] >= ne00 || + (dst_row[ixj] < ne00 && (order == GGML_SORT_ORDER_ASC ? 
+ x_row[dst_row[col]] > x_row[dst_row[ixj]] : + x_row[dst_row[col]] < x_row[dst_row[ixj]])) + ) { + SWAP(dst_row[col], dst_row[ixj], int); + } + } else { + if (dst_row[ixj] >= ne00 || + (dst_row[col] < ne00 && (order == GGML_SORT_ORDER_ASC ? + x_row[dst_row[col]] < x_row[dst_row[ixj]] : + x_row[dst_row[col]] > x_row[dst_row[ixj]])) + ) { + SWAP(dst_row[col], dst_row[ixj], int); + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + } + + // copy the result to dst without the padding + if (col < ne00) { + dst[row * ne00 + col] = dst_row[col]; + } +} diff --git a/ggml/src/ggml-opencl/kernels/div.cl b/ggml/src/ggml-opencl/kernels/div.cl new file mode 100644 index 000000000..d453ad99b --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/div.cl @@ -0,0 +1,72 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +//------------------------------------------------------------------------------ +// div +//------------------------------------------------------------------------------ +kernel void kernel_div( + global char * src0, + ulong offset0, + global char * src1, + ulong offset1, + global char * dst, + ulong offsetd, + ulong nb00, + ulong nb01, + ulong nb02, + ulong nb03, + int ne10, + int ne11, + int ne12, + int ne13, + ulong nb10, + ulong nb11, + ulong nb12, + ulong nb13, + int ne0, + ulong nb0, + ulong nb1, + ulong nb2, + ulong nb3 +) { + src0 = src0 + offset0; + src1 = src1 + offset1; + dst = dst + offsetd; + + int i03 = get_group_id(2); + int i02 = get_group_id(1); + int i01 = get_group_id(0); + + int i13 = i03 % ne13; + int i12 = i02 % ne12; + int i11 = i01 % ne11; + + global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01; + global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11; + global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1; + + for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) { + const int i10 = i0 % ne10; + *((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) / *((global float *)(src1_ptr + i10*nb10)); + } +} + +// assumption: src1 is a row +// broadcast src1 into src0 +kernel void kernel_div_row( + global float4 * src0, + ulong offset0, + global float4 * src1, + ulong offset1, + global float4 * dst, + ulong offsetd, + int ne +) { + src0 = (global float4*)((global char*)src0 + offset0); + src1 = (global float4*)((global char*)src1 + offset1); + dst = (global float4*)((global char*)dst + offsetd); + + // This performs better than using %. 
+ uint gid = get_global_id(0); + uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne + dst[gid] = src0[gid] / src1[idx1]; +} diff --git a/ggml/src/ggml-opencl/kernels/group_norm.cl b/ggml/src/ggml-opencl/kernels/group_norm.cl new file mode 100644 index 000000000..57c9df4d3 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/group_norm.cl @@ -0,0 +1,72 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +#ifdef cl_intel_subgroups +#pragma OPENCL EXTENSION cl_intel_subgroups : enable +#else +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#endif + +#ifdef cl_intel_required_subgroup_size +#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable +#define INTEL_GPU 1 +#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16))) +#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32))) +#elif defined(cl_qcom_reqd_sub_group_size) +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define ADRENO_GPU 1 +#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half"))) +#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full"))) +#endif + +// Workgroup must be a subgroup +#ifdef INTEL_GPU +REQD_SUBGROUP_SIZE_32 +#elif defined (ADRENO_GPU) +REQD_SUBGROUP_SIZE_64 +#endif +kernel void kernel_group_norm( + global float * src0, + ulong offset0, + global float * dst, + ulong offsetd, + int ne, + int group_size, + float eps +) { + src0 = (global float *)((global char *)src0 + offset0); + dst = (global float *)((global char *)dst + offsetd); + + int start = get_group_id(0) * group_size; + int end = start + group_size; + + start += get_local_id(0); + + if (end >= ne) { + end = ne; + } + + float tmp = 0.0f; + + for (int j = start; j < end; j += get_local_size(0)) { + tmp += src0[j]; + } + + tmp = sub_group_reduce_add(tmp); + + const float mean = tmp / group_size; + tmp = 0.0f; + + for (int j = start; j < end; j += get_local_size(0)) { + float xi = src0[j] - mean; + dst[j] = xi; + tmp += xi * xi; + } + + tmp = sub_group_reduce_add(tmp); + + const float variance = tmp / group_size; + const float scale = 1.0f/sqrt(variance + eps); + for (int j = start; j < end; j += get_local_size(0)) { + dst[j] *= scale; + } +} diff --git a/ggml/src/ggml-opencl/kernels/sigmoid.cl b/ggml/src/ggml-opencl/kernels/sigmoid.cl new file mode 100644 index 000000000..e3f669dde --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/sigmoid.cl @@ -0,0 +1,29 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +//------------------------------------------------------------------------------ +// sigmoid +//------------------------------------------------------------------------------ + +kernel void kernel_sigmoid_f32( + global float * src0, + ulong offset0, + global float * dst, + ulong offsetd +) { + src0 = (global float*)((global char*)src0 + offset0); + dst = (global float*)((global char*)dst + offsetd); + + dst[get_global_id(0)] = 1.0f / (1.0f + exp(-src0[get_global_id(0)])); +} + +kernel void kernel_sigmoid_f16( + global half * src0, + ulong offset0, + global half * dst, + ulong offsetd +) { + src0 = (global half*)((global char*)src0 + offset0); + dst = (global half*)((global char*)dst + offsetd); + + dst[get_global_id(0)] = 1.0f / (1.0f + exp(-src0[get_global_id(0)])); +} diff --git a/ggml/src/ggml-opencl/kernels/sub.cl b/ggml/src/ggml-opencl/kernels/sub.cl new file mode 100644 index 000000000..041e88ad3 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/sub.cl @@ -0,0 +1,72 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + 
+//------------------------------------------------------------------------------ +// div +//------------------------------------------------------------------------------ +kernel void kernel_sub( + global char * src0, + ulong offset0, + global char * src1, + ulong offset1, + global char * dst, + ulong offsetd, + ulong nb00, + ulong nb01, + ulong nb02, + ulong nb03, + int ne10, + int ne11, + int ne12, + int ne13, + ulong nb10, + ulong nb11, + ulong nb12, + ulong nb13, + int ne0, + ulong nb0, + ulong nb1, + ulong nb2, + ulong nb3 +) { + src0 = src0 + offset0; + src1 = src1 + offset1; + dst = dst + offsetd; + + int i03 = get_group_id(2); + int i02 = get_group_id(1); + int i01 = get_group_id(0); + + int i13 = i03 % ne13; + int i12 = i02 % ne12; + int i11 = i01 % ne11; + + global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01; + global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11; + global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1; + + for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) { + const int i10 = i0 % ne10; + *((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) - *((global float *)(src1_ptr + i10*nb10)); + } +} + +// assumption: src1 is a row +// broadcast src1 into src0 +kernel void kernel_sub_row( + global float4 * src0, + ulong offset0, + global float4 * src1, + ulong offset1, + global float4 * dst, + ulong offsetd, + int ne +) { + src0 = (global float4*)((global char*)src0 + offset0); + src1 = (global float4*)((global char*)src1 + offset1); + dst = (global float4*)((global char*)dst + offsetd); + + // This performs better than using %. + uint gid = get_global_id(0); + uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne + dst[gid] = src0[gid] - src1[idx1]; +} diff --git a/ggml/src/ggml-opencl/kernels/sum_rows.cl b/ggml/src/ggml-opencl/kernels/sum_rows.cl new file mode 100644 index 000000000..c5f7c570f --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/sum_rows.cl @@ -0,0 +1,39 @@ + +kernel void kernel_sum_rows_f32( + global float * src0, + ulong offset0, + global float * dst, + ulong offsetd, + int ne00, + int ne01, + int ne02, + int ne03, + ulong nb01, + ulong nb02, + ulong nb03, + ulong nb1, + ulong nb2, + ulong nb3 +) { + src0 = (global float *)((global char *)src0 + offset0); + dst = (global float *)((global char *)dst + offsetd); + + int i3 = get_global_id(2); + int i2 = get_global_id(1); + int i1 = get_global_id(0); + + if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) { + return; + } + + global float * src_row = (global float *) ((global char *) src0 + i1*nb01 + i2*nb02 + i3*nb03); + global float * dst_row = (global float *) ((global char *) dst + i1*nb1 + i2*nb2 + i3*nb3); + + float row_sum = 0; + + for (int i0 = 0; i0 < ne00; i0++) { + row_sum += src_row[i0]; + } + + dst_row[0] = row_sum; +} diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index a2e261248..2a0045bcc 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -13,7 +13,7 @@ elseif(SUPPORTS_SYCL) If you expected the oneAPI Release compiler, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh") else() - message(FATAL_ERROR, "C++ compiler lacks SYCL support.") + message(FATAL_ERROR "C++ compiler lacks SYCL support.") endif() message(STATUS "SYCL found") #todo: AOT @@ -170,7 +170,7 @@ else() target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_NVIDIA) elseif (GGML_SYCL_TARGET STREQUAL "AMD") if (NOT GGML_SYCL_DEVICE_ARCH) - message(ERROR "Can't enable SYCL 
hip backend, GGML_SYCL_DEVICE_ARCH has not been set.") + message(FATAL_ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.") endif() target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_rocblas) target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa") diff --git a/ggml/src/ggml-sycl/binbcast.cpp b/ggml/src/ggml-sycl/binbcast.cpp index aaa94176f..0a3883ae1 100644 --- a/ggml/src/ggml-sycl/binbcast.cpp +++ b/ggml/src/ggml-sycl/binbcast.cpp @@ -1,74 +1,93 @@ #include "binbcast.hpp" -#include #include #include #include -#include "dpct/helper.hpp" #include "ggml.h" -template -static __dpct_inline__ void k_bin_bcast_contiguous(const src0_t * __restrict__ src0, const src1_t * __restrict__ src1, - dst_t * dst, std::size_t num_elements, const sycl::nd_item<1> & it) { - auto element_id = it.get_global_id(0); - auto global_range = it.get_global_range(0); - for (; element_id < num_elements; element_id += global_range) { - auto src0_float_val = sycl::vec(src0[element_id]).template convert(); - auto src1_float_val = sycl::vec(src1[element_id]).template convert(); - float dst_val = bin_op(src0_float_val[0], src1_float_val[0]); - auto val_to_store = sycl::vec(dst_val).template convert(); - dst[element_id] = val_to_store; +template +static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst, + int ne0, int ne1, int ne2, int ne3, + int ne10, int ne11, int ne12, int ne13, + /*int s0, */ int s1, int s2, int s3, + /*int s00,*/ int s01, int s02, int s03, + /*int s10,*/ int s11, int s12, int s13, + const sycl::nd_item<3> &item_ct1) { + const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1)); + const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) + + item_ct1.get_local_id(0)) / + ne3; + const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) + + item_ct1.get_local_id(0)) % + ne3; + + if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { + return; + } + + const int i11 = i1 % ne11; + const int i12 = i2 % ne12; + const int i13 = i3 % ne13; + + const size_t i_src0 = i3*s03 + i2*s02 + i1*s01; + const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; + const size_t i_dst = i3*s3 + i2*s2 + i1*s1; + + const src0_t * src0_row = src0 + i_src0; + const src1_t * src1_row = src1 + i_src1; + dst_t * dst_row = dst + i_dst; + + for (int i0 = i0s; i0 < ne0; + i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { + const int i10 = i0 % ne10; + dst_row[i0] = (dst_t)bin_op(src0 ? 
(float)src0_row[i0] : 0.0f, (float)src1_row[i10]); } } -template -static __dpct_inline__ void k_bin_bcast(const src0_t * __restrict__ src0, const src1_t * __restrict__ src1, dst_t * dst, - int ne0, int ne1, int ne2, int ne3, int ne10, int ne11, int ne12, int ne13, - int s0, int s1, int s2, int s3, int s00, int s01, int s02, int s03, int s10, - int s11, int s12, int s13, std::size_t num_dst_elements, - const sycl::nd_item<1> & item_ct1) { - auto calculate_logical_index = - [](const std::array & dims, std::size_t element_id) __attribute__((always_inline))->std::array { - std::array logical_index; -#pragma unroll(4) - for (int i = 3; i >= 0; i--) { - logical_index[i] = element_id % dims[i]; - element_id /= dims[i]; - } - return logical_index; - }; +template +static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst, + int ne0, int ne1, int ne2, int ne3, + int ne10, int ne11, int ne12, int ne13, + /*int s0, */ int s1, int s2, int s3, + /*int s00,*/ int s01, int s02, int s03, + /*int s10,*/ int s11, int s12, int s13, + const sycl::nd_item<3> &item_ct1) { - auto calculate_index = [](const std::array & dims, const std::array & strides, - const std::array & indices) __attribute__((always_inline)) - ->std::size_t { - std::size_t index = 0; -#pragma unroll(4) - for (int i = 0; i < 4; i++) { - auto index_i = indices[i]; - if (indices[i] >= dims[i]) { - index_i = indices[i] % dims[i]; - } - index += strides[i] * index_i; - } - return index; - }; + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); - auto element_id = item_ct1.get_global_id(0); - for (; element_id < num_dst_elements; element_id += item_ct1.get_global_range(0)) { - auto logical_index = calculate_logical_index({ ne3, ne2, ne1, ne0 }, element_id); - auto src_0_index = calculate_index({ ne3, ne2, ne1, ne0 }, { s03, s02, s01, s00 }, logical_index); - auto src_1_index = calculate_index({ ne13, ne12, ne11, ne10 }, { s13, s12, s11, s10 }, logical_index); - auto dst_index = calculate_index({ ne3, ne2, ne1, ne0 }, { s3, s2, s1, s0 }, logical_index); - auto src0_float_val = sycl::vec(src0[src_0_index]).template convert(); - auto src1_float_val = sycl::vec(src1[src_1_index]).template convert(); - float dst_val = bin_op(src0_float_val[0], src1_float_val[0]); - auto val_to_store = sycl::vec(dst_val).template convert(); - dst[dst_index] = val_to_store; + const int i3 = i/(ne2*ne1*ne0); + const int i2 = (i/(ne1*ne0)) % ne2; + const int i1 = (i/ne0) % ne1; + const int i0 = i % ne0; + + if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { + return; } + + const int i11 = i1 % ne11; + const int i12 = i2 % ne12; + const int i13 = i3 % ne13; + + const size_t i_src0 = i3*s03 + i2*s02 + i1*s01; + const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; + const size_t i_dst = i3*s3 + i2*s2 + i1*s1; + + const src0_t * src0_row = src0 + i_src0; + const src1_t * src1_row = src1 + i_src1; + dst_t * dst_row = dst + i_dst; + + const int i10 = i0 % ne10; + dst_row[i0] = (dst_t)bin_op(src0 ? 
(float)src0_row[i0] : 0.0f, (float)src1_row[i10]); } -template struct bin_bcast_sycl { + +template +struct bin_bcast_sycl { template void operator()(const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03, const int64_t ne10, const int64_t ne11, @@ -77,73 +96,165 @@ template struct bin_bcast_sycl { const size_t nb10, const size_t nb11, const size_t nb12, const size_t nb13, const size_t nb0, const size_t nb1, const size_t nb2, const size_t nb3, const bool src0_is_contiguous, const bool src1_is_contiguous, const bool dst_is_contiguous, queue_ptr stream) { - auto check_bcast_required = [](const std::array & src_dims, - const std::array & dst_dims) -> bool { - for (int i = 0; i < 4; i++) { - if (dst_dims[i] > src_dims[i]) { - return true; - } - } - return false; + int nr0 = ne10 / ne0; + int nr1 = ne11/ne1; + int nr2 = ne12/ne2; + int nr3 = ne13/ne3; + + int nr[4] = { nr0, nr1, nr2, nr3 }; + + // collapse dimensions until first broadcast dimension + int64_t cne[] = {ne0, ne1, ne2, ne3}; + int64_t cne0[] = {ne00, ne01, ne02, ne03}; + int64_t cne1[] = {ne10, ne11, ne12, ne13}; + size_t cnb[] = {nb0, nb1, nb2, nb3}; + size_t cnb0[] = {nb00, nb01, nb02, nb03}; + size_t cnb1[] = {nb10, nb11, nb12, nb13}; + auto collapse = [](int64_t cne[]) { + cne[0] *= cne[1]; + cne[1] = cne[2]; + cne[2] = cne[3]; + cne[3] = 1; }; - dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); + auto collapse_nb = [](size_t cnb[], int64_t cne[]) { + cnb[1] *= cne[1]; + cnb[2] *= cne[2]; + cnb[3] *= cne[3]; + }; - GGML_ASSERT(nb0 % sizeof(dst_t) == 0); - GGML_ASSERT(nb1 % sizeof(dst_t) == 0); - GGML_ASSERT(nb2 % sizeof(dst_t) == 0); - GGML_ASSERT(nb3 % sizeof(dst_t) == 0); + if (src0_is_contiguous && src1_is_contiguous && dst_is_contiguous) { + for (int i = 0; i < 4; i++) { + if (nr[i] != 1) { + break; + } + if (i > 0) { + collapse_nb(cnb, cne); + collapse_nb(cnb0, cne0); + collapse_nb(cnb1, cne1); + collapse(cne); + collapse(cne0); + collapse(cne1); + } + } + } + { + int64_t ne0 = cne[0]; + int64_t ne1 = cne[1]; + int64_t ne2 = cne[2]; + int64_t ne3 = cne[3]; - GGML_ASSERT(nb00 % sizeof(src0_t) == 0); - GGML_ASSERT(nb01 % sizeof(src0_t) == 0); - GGML_ASSERT(nb02 % sizeof(src0_t) == 0); - GGML_ASSERT(nb03 % sizeof(src0_t) == 0); + int64_t ne10 = cne1[0]; + int64_t ne11 = cne1[1]; + int64_t ne12 = cne1[2]; + int64_t ne13 = cne1[3]; - GGML_ASSERT(nb10 % sizeof(src1_t) == 0); - GGML_ASSERT(nb11 % sizeof(src1_t) == 0); - GGML_ASSERT(nb12 % sizeof(src1_t) == 0); - GGML_ASSERT(nb13 % sizeof(src1_t) == 0); + size_t nb0 = cnb[0]; + size_t nb1 = cnb[1]; + size_t nb2 = cnb[2]; + size_t nb3 = cnb[3]; - // dst strides in number of elements - size_t s0 = nb0 / sizeof(dst_t); - size_t s1 = nb1 / sizeof(dst_t); - size_t s2 = nb2 / sizeof(dst_t); - size_t s3 = nb3 / sizeof(dst_t); + size_t nb00 = cnb0[0]; + size_t nb01 = cnb0[1]; + size_t nb02 = cnb0[2]; + size_t nb03 = cnb0[3]; - // src1 strides in number of elements - size_t s10 = nb10 / sizeof(src0_t); - size_t s11 = nb11 / sizeof(src1_t); - size_t s12 = nb12 / sizeof(src1_t); - size_t s13 = nb13 / sizeof(src1_t); + size_t nb10 = cnb1[0]; + size_t nb11 = cnb1[1]; + size_t nb12 = cnb1[2]; + size_t nb13 = cnb1[3]; - // src0 strides in number of elements - size_t s00 = nb00 / sizeof(src0_t); - size_t s01 = nb01 / sizeof(src0_t); - size_t s02 = nb02 / sizeof(src0_t); - size_t s03 = nb03 / sizeof(src0_t); + size_t s0 = nb0 / sizeof(dst_t); + size_t s1 = nb1 / sizeof(dst_t); + size_t 
s2 = nb2 / sizeof(dst_t); + size_t s3 = nb3 / sizeof(dst_t); - std::size_t num_dst_elements = static_cast(ne0) * static_cast(ne1) * - static_cast(ne2) * static_cast(ne3); - std::size_t local_range = 256; - std::size_t global_range = ceil_div(num_dst_elements, local_range) * local_range; + size_t s10 = nb10 / sizeof(src1_t); + size_t s11 = nb11 / sizeof(src1_t); + size_t s12 = nb12 / sizeof(src1_t); + size_t s13 = nb13 / sizeof(src1_t); - bool needs_broadcasting = check_bcast_required({ ne00, ne01, ne02, ne03 }, { ne0, ne1, ne2, ne3 }) || - check_bcast_required({ ne10, ne11, ne12, ne13 }, { ne0, ne1, ne2, ne3 }); - bool all_contiguous = src0_is_contiguous && src1_is_contiguous && dst_is_contiguous; + size_t s00 = nb00 / sizeof(src0_t); + size_t s01 = nb01 / sizeof(src0_t); + size_t s02 = nb02 / sizeof(src0_t); + size_t s03 = nb03 / sizeof(src0_t); - if (! needs_broadcasting && all_contiguous) { - stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<1>({ global_range }, { local_range }), [=](sycl::nd_item<1> it) { - k_bin_bcast_contiguous(src0_dd, src1_dd, dst_dd, num_dst_elements, it); - }); - }); - } else { - stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<1>({ global_range }, { local_range }), [=](sycl::nd_item<1> it) { - k_bin_bcast(src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, ne10, ne11, ne12, ne13, s0, s1, - s2, s3, s00, s01, s02, s03, s10, s11, s12, s13, num_dst_elements, it); - }); - }); + GGML_UNUSED(s00); + + GGML_ASSERT(nb0 % sizeof(dst_t) == 0); + GGML_ASSERT(nb1 % sizeof(dst_t) == 0); + GGML_ASSERT(nb2 % sizeof(dst_t) == 0); + GGML_ASSERT(nb3 % sizeof(dst_t) == 0); + + GGML_ASSERT(nb00 % sizeof(src0_t) == 0); + GGML_ASSERT(nb01 % sizeof(src0_t) == 0); + GGML_ASSERT(nb02 % sizeof(src0_t) == 0); + GGML_ASSERT(nb03 % sizeof(src0_t) == 0); + + GGML_ASSERT(nb10 % sizeof(src1_t) == 0); + GGML_ASSERT(nb11 % sizeof(src1_t) == 0); + GGML_ASSERT(nb12 % sizeof(src1_t) == 0); + GGML_ASSERT(nb13 % sizeof(src1_t) == 0); + + GGML_ASSERT(s0 == 1); + GGML_ASSERT(s10 == 1); + + const int block_size = 128; + + int64_t hne0 = std::max(ne0/2LL, 1LL); + + sycl::range<3> block_dims(1, 1, 1); + block_dims[2] = std::min(hne0, block_size); + block_dims[1] = std::min( + ne1, block_size / (unsigned int)block_dims[2]); + block_dims[0] = std::min( + std::min( + ne2 * ne3, block_size / (unsigned int)block_dims[2] / + (unsigned int)block_dims[1]), + 64U); + + sycl::range<3> block_nums( + (ne2 * ne3 + block_dims[0] - 1) / block_dims[0], + (ne1 + block_dims[1] - 1) / block_dims[1], + (hne0 + block_dims[2] - 1) / block_dims[2]); + + if (block_nums[0] > 65535) { + // this is the maximum number of blocks in z direction, fallback to 1D grid kernel + int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) * + sycl::range<3>(1, 1, block_size), + sycl::range<3>(1, 1, block_size)), + [=](sycl::nd_item<3> item_ct1) { + k_bin_bcast_unravel( + src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, + ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02, + s03, s11, s12, s13, item_ct1); + }); + } + } else { + /* + DPCT1049:16: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. 
+ */ + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + k_bin_bcast(src0_dd, src1_dd, dst_dd, ne0, ne1, + ne2, ne3, ne10, ne11, ne12, ne13, + s1, s2, s3, s01, s02, s03, s11, s12, s13, + item_ct1); + }); + } } } }; @@ -208,32 +319,27 @@ inline void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, ggml_tensor *ds void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); ggml_sycl_op_add(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); ggml_sycl_op_sub(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); ggml_sycl_op_mul(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); ggml_sycl_op_div(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_repeat(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 60909dde7..15ee9dc69 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -13,8 +13,10 @@ #ifndef GGML_SYCL_COMMON_HPP #define GGML_SYCL_COMMON_HPP +#include #include #include +#include #include "dpct/helper.hpp" #include "ggml-sycl.h" @@ -44,11 +46,20 @@ extern int g_ggml_sycl_debug; extern int g_ggml_sycl_disable_optimize; extern int g_ggml_sycl_prioritize_dmmv; -#define GGML_SYCL_DEBUG(...) \ - do { \ - if (g_ggml_sycl_debug) \ - fprintf(stderr, __VA_ARGS__); \ - } while (0) +#if defined(__clang__) && __has_builtin(__builtin_expect) +// Hint the optimizer to pipeline the more likely following instruction in branches +# define LIKELY(expr) __builtin_expect(expr, true) +# define UNLIKELY(expr) __builtin_expect(expr, false) +#else +# define LIKELY(expr) (expr) +# define UNLIKELY(expr) (expr) +#endif + +#define GGML_SYCL_DEBUG(...) 
\ + do { \ + if (UNLIKELY(g_ggml_sycl_debug)) \ + fprintf(stderr, __VA_ARGS__); \ + } while (0) #define CHECK_TRY_ERROR(expr) \ [&]() { \ @@ -471,6 +482,19 @@ static __dpct_inline__ float warp_reduce_max(float x, return x; } +/* Helper for Computing the linear offset of a ggml_tensor given +per-dimension sizes, strides, and indices */ +template +__dpct_inline__ size_t calculate_offset(const std::array & strides, const std::array & indices) { + size_t offset = 0; +#pragma unroll + for (int i = 0; i < N; i++) { + auto index_i = indices[i]; + offset += strides[i] * index_i; + } + return offset; +} + // Helper for vec loading aligned data template inline sycl::vec vec_aligned_load(const Tp* aligned_ptr) { @@ -490,4 +514,76 @@ constexpr size_t ceil_div(const size_t m, const size_t n) { } bool gpu_has_xmx(sycl::device &dev); + +template void debug_print_array(const std::string & prefix, const T array[N]) { + if (LIKELY(!g_ggml_sycl_debug)) { + return; + } + std::stringstream ss; + ss << prefix << "=["; + for (std::size_t i = 0; i < N - 1; ++i) { + ss << array[i] << ", "; + } + if constexpr (N > 0) { + ss << array[N - 1]; + } + ss << "]"; + GGML_SYCL_DEBUG("%s", ss.str().c_str()); +} + +inline void debug_print_tensor(const std::string & prefix, const ggml_tensor * tensor, + const std::string & suffix = "") { + if (LIKELY(!g_ggml_sycl_debug)) { + return; + } + GGML_SYCL_DEBUG("%s=", prefix.c_str()); + if (tensor) { + GGML_SYCL_DEBUG("'%s':type=%s", tensor->name, ggml_type_name(tensor->type)); + debug_print_array(";ne", tensor->ne); + debug_print_array(";nb", tensor->nb); + if (!ggml_is_contiguous(tensor)) { + GGML_SYCL_DEBUG(";strided"); + } + if (ggml_is_permuted(tensor)) { + GGML_SYCL_DEBUG(";permuted"); + } + } else { + GGML_SYCL_DEBUG("nullptr"); + } + GGML_SYCL_DEBUG("%s", suffix.c_str()); +} + +// Use scope_op_debug_print to log operations coming from running a model +struct scope_op_debug_print { + // Use string_views to avoid the cost of creating a string and concatenating them + // string_views must be alive for as long as the object is alive + // scope_op_debug_print are used with string literals in practice which are stored in constant space so always accessible + scope_op_debug_print(const std::string_view & func, const std::string_view & func_suffix, const ggml_tensor * dst, + std::size_t num_src, const std::string_view & suffix = "") : + func(func), + func_suffix(func_suffix) { + if (LIKELY(!g_ggml_sycl_debug)) { + return; + } + GGML_SYCL_DEBUG("[SYCL][OP] call %s%s:", func.data(), func_suffix.data()); + debug_print_tensor(" dst", dst); + if (dst) { + for (std::size_t i = 0; i < num_src; ++i) { + debug_print_tensor("\tsrc" + std::to_string(i), dst->src[i]); + } + } + GGML_SYCL_DEBUG("%s\n", suffix.data()); + } + + scope_op_debug_print(const std::string_view & func, const ggml_tensor * dst, std::size_t num_src, + const std::string_view & suffix = "") : + scope_op_debug_print(func, "", dst, num_src, suffix) {} + + ~scope_op_debug_print() { GGML_SYCL_DEBUG("[SYCL][OP] call %s%s done\n", func.data(), func_suffix.data()); } + + private: + std::string_view func; + std::string_view func_suffix; +}; + #endif // GGML_SYCL_COMMON_HPP diff --git a/ggml/src/ggml-sycl/concat.cpp b/ggml/src/ggml-sycl/concat.cpp index d41cfd3a6..7aa91c861 100644 --- a/ggml/src/ggml-sycl/concat.cpp +++ b/ggml/src/ggml-sycl/concat.cpp @@ -159,39 +159,37 @@ static void concat_f32_sycl_non_cont( } void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - const ggml_tensor *src0 = dst->src[0]; - 
const ggml_tensor *src1 = dst->src[1]; - queue_ptr stream = ctx.stream(); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + queue_ptr stream = ctx.stream(); - const int32_t dim = ((int32_t *)dst->op_params)[0]; + const int32_t dim = ((int32_t *) dst->op_params)[0]; - if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) { - const float *src0_d = (const float *)src0->data; - const float *src1_d = (const float *)src1->data; + if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) { + const float * src0_d = (const float *) src0->data; + const float * src1_d = (const float *) src1->data; - float *dst_d = (float *)dst->data; + float * dst_d = (float *) dst->data; - if (dim != 3) { - for (int i3 = 0; i3 < dst->ne[3]; i3++) { - concat_f32_sycl( - src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4), - dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1], - src0->ne[2], dst->ne[0], dst->ne[1], dst->ne[2], dim, stream); - } + if (dim != 3) { + for (int i3 = 0; i3 < dst->ne[3]; i3++) { + concat_f32_sycl(src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4), + dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1], src0->ne[2], dst->ne[0], + dst->ne[1], dst->ne[2], dim, stream); + } + } else { + const size_t size0 = ggml_nbytes(src0); + const size_t size1 = ggml_nbytes(src1); + + SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait())); + SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait())); + } } else { - const size_t size0 = ggml_nbytes(src0); - const size_t size1 = ggml_nbytes(src1); - - SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait())); - SYCL_CHECK(CHECK_TRY_ERROR( - stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait())); + concat_f32_sycl_non_cont(stream, (const char *) src0->data, (const char *) src1->data, (char *) dst->data, + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0->nb[0], src0->nb[1], + src0->nb[2], src0->nb[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], + src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], + dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim); } - } else - concat_f32_sycl_non_cont( - stream, (const char *)src0->data, (const char *)src1->data, - (char *)dst->data, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], - src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], src1->ne[0], - src1->ne[1], src1->ne[2], src1->ne[3], src1->nb[0], src1->nb[1], - src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], - dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim); } diff --git a/ggml/src/ggml-sycl/conv.cpp b/ggml/src/ggml-sycl/conv.cpp index ddba601e1..475bd34a2 100644 --- a/ggml/src/ggml-sycl/conv.cpp +++ b/ggml/src/ggml-sycl/conv.cpp @@ -72,6 +72,7 @@ static void conv_transpose_1d_f32_f32_sycl( } void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); const ggml_tensor *src0 = dst->src[0]; const ggml_tensor *src1 = dst->src[1]; const float * src0_d = (const float *)src0->data; diff --git a/ggml/src/ggml-sycl/cpy.cpp b/ggml/src/ggml-sycl/cpy.cpp index 5a2314589..44487c256 100644 --- a/ggml/src/ggml-sycl/cpy.cpp +++ b/ggml/src/ggml-sycl/cpy.cpp @@ -616,6 +616,9 @@ static void ggml_cpy_i32_i32_sycl(const char * cx, char * cdst, const int ne, co } void 
ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try {
+    // Unlike other operators ggml_sycl_cpy takes 2 distinct tensors instead of a dst ggml_tensor and rely on its src field
+    scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0,
+                                         std::string(" src0 type=") + ggml_type_name(src0->type));
     const int64_t ne = ggml_nelements(src0);
     GGML_ASSERT(ne == ggml_nelements(src1));
@@ -629,8 +632,6 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co
     char * src0_ddc = (char *) src0->data;
     char * src1_ddc = (char *) src1->data;
 
-    GGML_SYCL_DEBUG("[SYCL] %s: Tensor supplied: %s to %s\n", __func__, ggml_type_name(src0->type),
-                    ggml_type_name(src1->type));
     if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_f32_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
@@ -694,8 +695,6 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co
 }
 
 void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    // TODO: why do we pass dst as src1 here?
-    GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_cpy(ctx, dst->src[0], dst);
-    GGML_SYCL_DEBUG("[SYCL] call %s done\n", __func__);
 }
diff --git a/ggml/src/ggml-sycl/dmmv.cpp b/ggml/src/ggml-sycl/dmmv.cpp
index b58150c68..4f2760110 100644
--- a/ggml/src/ggml-sycl/dmmv.cpp
+++ b/ggml/src/ggml-sycl/dmmv.cpp
@@ -1092,6 +1092,8 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
         src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
 
     if (src1_convert_f16) {
+        scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2,
+                                             " : converting src1 to fp16");
         src1_dfloat = src1_dfloat_a.alloc(ne00);
         const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
         GGML_ASSERT(to_fp16_sycl != nullptr);
diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp
index becaac404..5b7c4f0b4 100644
--- a/ggml/src/ggml-sycl/element_wise.cpp
+++ b/ggml/src/ggml-sycl/element_wise.cpp
@@ -84,6 +84,15 @@ static void gelu_quick(const T *x, T *dst, int k,
     dst[i] = x[i] * (static_cast<T>(1.0f) / (static_cast<T>(1.0f) + sycl::native::exp(GELU_QUICK_COEF * x[i])));
 }
 
+template<typename T>
+static void gelu_erf(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
+    const T SQRT_2_INV = static_cast<T>(0.70710678118654752440084436210484f);
+    for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
+        auto x_i = x[i];
+        dst[i] = static_cast<T>(0.5f) * x_i * (static_cast<T>(1.0f) + sycl::erf(x_i * SQRT_2_INV));
+    }
+}
+
 template<typename T>
 static void tanh(const T *x, T *dst, int k,
                  const sycl::nd_item<3> &item_ct1) {
@@ -400,6 +409,20 @@ static void gelu_quick_sycl(const T *x, T *dst, const int k,
         });
 }
 
+
+template<typename T>
+static void gelu_erf_sycl(const T *x, T *dst, const int k,
+                          queue_ptr stream) {
+    const int num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            gelu_erf(x, dst, k, item_ct1);
+        });
+}
+
 template<typename T>
 static void tanh_sycl(const T *x, T *dst, const int k,
                       queue_ptr stream) {
@@ -816,6 +839,38 @@ inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor
     }
 }
 
+inline void
ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { +#if defined (GGML_SYCL_F16) + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); +#else + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); +#endif + GGML_ASSERT(dst->src[0]->type == dst->type); + dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + switch (dst->type) { +#if defined (GGML_SYCL_F16) + case GGML_TYPE_F16: + { + auto data_pts = cast_data<sycl::half>(dst); + gelu_erf_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); + break; + } +#endif + case GGML_TYPE_F32: + { + auto data_pts = cast_data<float>(dst); + gelu_erf_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); + break; + } + default: + GGML_ABORT("GGML tensor type not supported!\n"); + } +} + + inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { #if defined (GGML_SYCL_F16) GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); @@ -1391,146 +1446,126 @@ inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_sqrt(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_sin(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_cos(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_acc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); ggml_sycl_op_acc(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_gelu(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_silu(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_gelu_quick(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + 
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_gelu_erf(ctx, dst); } void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_tanh(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_relu(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_sigmoid(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_hardsigmoid(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_hardswish(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } - void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_exp(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_log(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_neg(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_step(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_leaky_relu(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_sqr(ctx, dst); - GGML_SYCL_DEBUG("call %s 
done\n", __func__); } void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_upscale(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_pad(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_clamp(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_sgn(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_abs(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_elu(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } diff --git a/ggml/src/ggml-sycl/element_wise.hpp b/ggml/src/ggml-sycl/element_wise.hpp index f4199d69d..bd40113f0 100644 --- a/ggml/src/ggml-sycl/element_wise.hpp +++ b/ggml/src/ggml-sycl/element_wise.hpp @@ -38,6 +38,8 @@ void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst); void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-sycl/getrows.cpp b/ggml/src/ggml-sycl/getrows.cpp index 64665be46..4a7712781 100644 --- a/ggml/src/ggml-sycl/getrows.cpp +++ b/ggml/src/ggml-sycl/getrows.cpp @@ -257,8 +257,7 @@ static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tens GGML_UNUSED(ctx); } -void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - +void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ASSERT(dst->src[1]->type == GGML_TYPE_I32); GGML_ASSERT(dst->type == GGML_TYPE_F32); @@ -308,4 +307,3 @@ void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { GGML_ABORT("fatal error"); } } - diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index d05919781..bcd2ea536 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -346,6 +346,8 @@ static void * ggml_backend_sycl_buffer_get_base(ggml_backend_buffer_t buffer) { static enum ggml_status 
ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) try { + GGML_SYCL_DEBUG("[SYCL] call %s", __func__); + debug_print_tensor(": tensor=", tensor, "\n"); ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context; if (tensor->view_src != NULL) { @@ -381,7 +383,9 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, size_t offset, size_t size) try { - + GGML_SYCL_DEBUG("[SYCL] call %s", __func__); + debug_print_tensor(": tensor=", tensor); + GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; ggml_sycl_set_device(ctx->device); auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue()); @@ -407,7 +411,9 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *tensor, void *data, size_t offset, size_t size) try { - + GGML_SYCL_DEBUG("[SYCL] call %s", __func__); + debug_print_tensor(": tensor=", tensor); + GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; ggml_sycl_set_device(ctx->device); @@ -435,7 +441,12 @@ static bool ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *src, ggml_tensor *dst) try { - if (ggml_backend_buffer_is_sycl(src->buffer)) { + bool is_cpy_supported = ggml_backend_buffer_is_sycl(src->buffer); + GGML_SYCL_DEBUG("[SYCL] call %s", __func__); + debug_print_tensor(": dst=", dst); + debug_print_tensor(" src=", src); + GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported); + if (is_cpy_supported) { ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context; ggml_backend_sycl_buffer_context * dst_ctx = (ggml_backend_sycl_buffer_context *)dst->buffer->context; @@ -492,7 +503,8 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer, static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) try { - ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; + GGML_SYCL_DEBUG("[SYCL] call %s: size=%zu\n", __func__, buffer->size); + ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context; ggml_sycl_set_device(ctx->device); queue_ptr stream = ctx->stream; @@ -511,7 +523,9 @@ catch (sycl::exception const &exc) { static void ggml_backend_sycl_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { - GGML_SYCL_DEBUG(" [SYCL] call %s\n", __func__); + GGML_SYCL_DEBUG("[SYCL] call %s", __func__); + debug_print_tensor(": tensor=", tensor); + GGML_SYCL_DEBUG(" size=%zu offset=%zu value=%u\n", size, offset, value); ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context; SYCL_CHECK(ggml_sycl_set_device(ctx->device)); auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue()); @@ -789,6 +803,8 @@ static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buff static enum ggml_status ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) try { + GGML_SYCL_DEBUG("[SYCL] call %s", __func__); + debug_print_tensor(": tensor=", tensor, "\n"); GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported 
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; @@ -873,6 +889,9 @@ static void ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, size_t offset, size_t size) try { + GGML_SYCL_DEBUG("[SYCL] call %s", __func__); + debug_print_tensor(": tensor=", tensor); + GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); GGML_ASSERT(size == ggml_nbytes(tensor)); @@ -926,6 +945,9 @@ static void ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *tensor, void *data, size_t offset, size_t size) try { + GGML_SYCL_DEBUG("[SYCL] call %s", __func__); + debug_print_tensor(": tensor=", tensor); + GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); GGML_ASSERT(size == ggml_nbytes(tensor)); @@ -2015,12 +2037,12 @@ inline void ggml_sycl_op_mul_mat_sycl( #else bool use_fp16 = false; #endif - if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && - use_fp16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && - dst->op_params[0] == GGML_PREC_DEFAULT) { - // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp16 path\n"); + if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && use_fp16 && ggml_is_contiguous(src0) && + row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) { ggml_sycl_pool_alloc src0_as_f16(ctx.pool()); if (src0->type != GGML_TYPE_F16) { + scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2, + " : converting src0 to fp16"); const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src0->type, dst); GGML_ASSERT(to_fp16_sycl != nullptr); size_t ne = row_diff*ne00; @@ -2033,6 +2055,8 @@ inline void ggml_sycl_op_mul_mat_sycl( ggml_sycl_pool_alloc src1_as_f16(ctx.pool()); if (src1->type != GGML_TYPE_F16) { + scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2, + " : converting src1 to fp16"); const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst); GGML_ASSERT(to_fp16_sycl != nullptr); size_t ne = src1_ncols*ne10; @@ -2049,6 +2073,8 @@ inline void ggml_sycl_op_mul_mat_sycl( DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr, DnnlGemmWrapper::to_dt(), src0_ptr, DnnlGemmWrapper::to_dt(), dst_f16.get(), DnnlGemmWrapper::to_dt(), stream); + scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2, + " : converting dst to fp32"); const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst); to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream); } @@ -2064,21 +2090,25 @@ inline void ggml_sycl_op_mul_mat_sycl( src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16, dst_f16.get(), dpct::library_data_t::real_half, ldc, dpct::library_data_t::real_half))); + scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2, + " : converting dst to fp32"); const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst); to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream); } - } - else { - // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp32 path\n"); + } else { ggml_sycl_pool_alloc src0_ddq_as_f32(ctx.pool()); ggml_sycl_pool_alloc src1_ddq_as_f32(ctx.pool()); if (src0->type != GGML_TYPE_F32) { + scope_op_debug_print 
scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2, + " : converting src0 to fp32"); const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src0->type, dst); GGML_ASSERT(to_fp32_sycl != nullptr); src0_ddq_as_f32.alloc(row_diff*ne00); to_fp32_sycl(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream); } if (src1->type != GGML_TYPE_F32) { + scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2, + " : converting src1 to fp32"); const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src1->type, dst); GGML_ASSERT(to_fp32_sycl != nullptr); src1_ddq_as_f32.alloc(src1_ncols*ne10); @@ -2114,8 +2144,7 @@ catch (sycl::exception const &exc) { std::exit(1); } -static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - +static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); dpct::queue_ptr main_stream = ctx.stream(); @@ -2167,8 +2196,7 @@ inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, ggml_tensor *dst) sum_rows_f32_sycl(src0_dd, dst_dd, ne, 1, main_stream); } -inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - +inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); dpct::queue_ptr main_stream = ctx.stream(); @@ -2199,8 +2227,7 @@ inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * argsort_f32_i32_sycl(src0_dd, (int *) dst_dd, ncols, nrows, order, main_stream); } -inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - +inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_I32); @@ -2215,8 +2242,7 @@ inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor *ds argmax_f32_i32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream); } -inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx,ggml_tensor *dst) { - +inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); dpct::queue_ptr main_stream = ctx.stream(); @@ -2233,8 +2259,7 @@ inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx,ggml_tens diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream); } -inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - +inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); dpct::queue_ptr main_stream = ctx.stream(); @@ -2421,6 +2446,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten dev[i].src1_ddq = dev[i].src1_ddq_alloc.alloc(ctx.pool(i), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs); if (src1_on_device && src1_is_contiguous) { + scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst, + /*num_src=*/2, " : converting src1 to Q8_1"); quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, stream); /* DPCT1010:90: SYCL uses exceptions to report errors and does not @@ -2525,6 +2552,8 @@ static void 
ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten } if (convert_src1_to_q8_1 && !src1_is_contiguous) { + scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst, + /*num_src=*/2, " : converting src1 to Q8_1"); quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream); /* DPCT1010:92: SYCL uses exceptions to report errors and does @@ -2619,33 +2648,28 @@ catch (sycl::exception const &exc) { static void ggml_sycl_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); ggml_sycl_op_get_rows(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } static void ggml_sycl_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_norm(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_rms_norm(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } static void ggml_sycl_l2_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_l2_norm(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_group_norm(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, @@ -2773,6 +2797,8 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons // convert src1 to fp16 if (src1->type != GGML_TYPE_F16) { + scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_nc_sycl", dst, /*num_src=*/2, + " : converting src1 to fp16"); const to_fp16_nc_sycl_t to_fp16_nc_sycl = get_to_fp16_nc_sycl(src1->type); GGML_ASSERT(to_fp16_nc_sycl != nullptr); const int64_t ne_src1 = ggml_nelements(src1); @@ -3076,6 +3102,7 @@ static bool can_use_mul_mat_vec_q(const ggml_tensor * src0, const ggml_tensor * } static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer); int64_t min_compute_capability = INT_MAX; @@ -3153,7 +3180,6 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor constexpr bool convert_src1_to_q8_1 = false; ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, convert_src1_to_q8_1); } - GGML_SYCL_DEBUG("call %s done\n", __func__); } @@ -3224,6 +3250,7 @@ __dpct_inline__ static void k_copy_dst_from_contiguous( static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, ggml_tensor *dst) try { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/3); const ggml_tensor *src0 = dst->src[0]; const ggml_tensor *src1 = dst->src[1]; GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer) && "mul_mat_id does not support split buffers"); @@ -3392,37 
+3419,45 @@ catch (sycl::exception const &exc) { } static void ggml_sycl_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_scale(ctx, dst); } static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_diag_mask_inf(ctx, dst); } static void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_pool2d(ctx, dst); } static void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); ggml_sycl_op_im2col(ctx, dst); } static void ggml_sycl_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); GGML_ASSERT(ggml_is_contiguous(dst->src[0])); ggml_sycl_op_sum(ctx, dst); } static void ggml_sycl_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); GGML_ASSERT(ggml_is_contiguous(dst->src[0])); ggml_sycl_op_sum_rows(ctx, dst); } static void ggml_sycl_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); GGML_ASSERT(ggml_is_contiguous(dst->src[0])); ggml_sycl_op_argsort(ctx, dst); } static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); GGML_ASSERT(ggml_is_contiguous(dst->src[0])); ggml_sycl_op_argmax(ctx, dst); } @@ -3508,6 +3543,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg case GGML_UNARY_OP_GELU_QUICK: ggml_sycl_gelu_quick(ctx, dst); break; + case GGML_UNARY_OP_GELU_ERF: + ggml_sycl_gelu_erf(ctx, dst); + break; case GGML_UNARY_OP_TANH: ggml_sycl_tanh(ctx, dst); break; @@ -3716,6 +3754,9 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend, ggml_tensor *tensor, const void *data, size_t offset, size_t size) try { + GGML_SYCL_DEBUG("[SYCL] call %s", __func__); + debug_print_tensor(": tensor=", tensor); + GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; @@ -3734,13 +3775,16 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend, const ggml_tensor *tensor, void *data, size_t offset, size_t size) try { + GGML_SYCL_DEBUG("[SYCL] call %s", __func__); + debug_print_tensor(": tensor=", tensor); + GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; ggml_backend_buffer_t buf = tensor->view_src ? 
tensor->view_src->buffer : tensor->buffer; GGML_ASSERT(buf->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type"); const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0); SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy( - data, (const char *)tensor->data + offset, size).wait())); + data, (const char *)tensor->data + offset, size))); } catch (sycl::exception const &exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ @@ -3752,7 +3796,13 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor *src, ggml_tensor *dst) try { ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; - if (dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && ggml_backend_buffer_is_sycl(src->buffer)) { + bool is_cpy_supported = dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && + ggml_backend_buffer_is_sycl(src->buffer); + GGML_SYCL_DEBUG("[SYCL] call %s", __func__); + debug_print_tensor(": dst=", dst); + debug_print_tensor(" src=", src); + GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported); + if (is_cpy_supported) { /* DPCT1009:215: SYCL uses exceptions to report errors and does not use the error codes. The original code was commented out and a warning string @@ -3760,7 +3810,7 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend, */ const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0); SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy( - dst->data, src->data, ggml_nbytes(dst)).wait())); + dst->data, src->data, ggml_nbytes(dst)))); return true; } @@ -3773,6 +3823,7 @@ catch (sycl::exception const &exc) { } static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try { + GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__); ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0); SYCL_CHECK(CHECK_TRY_ERROR((stream)->wait())); @@ -3809,11 +3860,43 @@ static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * syc } } +#ifdef GGML_SYCL_GRAPH +static bool check_graph_compatibility(ggml_cgraph * cgraph) { + if (ggml_sycl_info().device_count > 1) { + // A sycl_ex::command_graph object can only be created for a single device + GGML_LOG_INFO("%s: disabling SYCL graphs due to multiple devices\n", __func__); + return false; + } + + for (int i = 0; i < cgraph->n_nodes; i++) { + const ggml_op node_op = cgraph->nodes[i]->op; + switch (node_op) { + default: + break; + case GGML_OP_CONCAT: + // ggml_sycl_op_concat() does a blocking host wait after memcpy operations, + // but wait() can't be called on the events returned by a queue recording + // to a graph. + [[fallthrough]]; + case GGML_OP_MUL_MAT_ID: + // ggml_sycl_mul_mat_id() does a blocking host wait on the sycl queue after + // submitting a memcpy operation, but wait() can't be called on a queue that + // is recording to a graph. 
+ GGML_LOG_INFO("%s: disabling SYCL graphs due to unsupported node type %s\n", __func__, + ggml_op_name(node_op)); + return false; + } + } + return true; +} +#endif + static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { auto * sycl_ctx = static_cast(backend->context); #ifdef GGML_SYCL_GRAPH - if (!g_ggml_sycl_disable_graph) { + bool use_sycl_graph = !g_ggml_sycl_disable_graph && check_graph_compatibility(cgraph); + if (use_sycl_graph) { const bool graph_support = dpct::get_device(sycl_ctx->device).has(sycl::aspect::ext_oneapi_limited_graph); if (!graph_support) { GGML_SYCL_DEBUG("[SYCL-GRAPH] can not use graphs on device:%d\n", sycl_ctx->device); @@ -3874,7 +3957,7 @@ catch (sycl::exception const &exc) } static void ggml_backend_sycl_event_wait(ggml_backend_t backend, ggml_backend_event_t event) try { - + GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__); sycl::event* sycl_event = static_cast(event->context); if (ggml_backend_is_sycl(backend)) { @@ -4016,6 +4099,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_UNARY_OP_HARDSIGMOID: case GGML_UNARY_OP_HARDSWISH: case GGML_UNARY_OP_GELU_QUICK: + case GGML_UNARY_OP_GELU_ERF: case GGML_UNARY_OP_TANH: case GGML_UNARY_OP_EXP: case GGML_UNARY_OP_SGN: @@ -4161,6 +4245,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g #endif case GGML_OP_NORM: case GGML_OP_RMS_NORM: + return true; case GGML_OP_L2_NORM: case GGML_OP_GROUP_NORM: return ggml_is_contiguous(op->src[0]); @@ -4172,14 +4257,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_SOFT_MAX: return true; case GGML_OP_ROPE: - { - const int mode = ((const int32_t *) op->op_params)[2]; - // mode is not used as a bitmask in practice, the various rope type modes are independent implementations - if (mode == GGML_ROPE_TYPE_MROPE) { - return false; - } - return true; - } case GGML_OP_IM2COL: return true; case GGML_OP_UPSCALE: @@ -4269,6 +4346,7 @@ static void ggml_backend_sycl_device_event_free(ggml_backend_dev_t dev, ggml_bac static void ggml_backend_sycl_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) try { GGML_UNUSED(dev); + GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__); sycl::event *sycl_event = static_cast(event->context); SYCL_CHECK(CHECK_TRY_ERROR(sycl_event->wait())); diff --git a/ggml/src/ggml-sycl/gla.cpp b/ggml/src/ggml-sycl/gla.cpp index eedb47486..879184fdd 100644 --- a/ggml/src/ggml-sycl/gla.cpp +++ b/ggml/src/ggml-sycl/gla.cpp @@ -76,6 +76,7 @@ static void gated_linear_attn_f32_kernel(const dpct::queue_ptr stream, u_int B, } void ggml_sycl_op_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/5); const float * k_d = static_cast(dst->src[0]->data); const float * v_d = static_cast(dst->src[1]->data); const float * r_d = static_cast(dst->src[2]->data); diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp index 23eeb74da..cb70f83a4 100644 --- a/ggml/src/ggml-sycl/mmvq.cpp +++ b/ggml/src/ggml-sycl/mmvq.cpp @@ -1059,8 +1059,10 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens case GGML_TYPE_Q4_K: if ((ggml_tensor_extra_gpu *) dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { + GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q4_k_q8_1_sycl\n"); reorder_mul_mat_vec_q4_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, 
dst_dd_i_bs, ne00, row_diff, stream); } else { + GGML_SYCL_DEBUG("Calling mul_mat_vec_q4_K_q8_1_sycl\n"); mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); } break; diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp index 4e9f438b4..4ec141684 100644 --- a/ggml/src/ggml-sycl/norm.cpp +++ b/ggml/src/ggml-sycl/norm.cpp @@ -1,40 +1,50 @@ #include "norm.hpp" +#include "ggml-sycl/common.hpp" +#include "ggml-sycl/presets.hpp" -static void norm_f32(const float* x, float* dst, const int ncols, const float eps, - const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) { - const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + - item_ct1.get_local_id(1); - const int tid = item_ct1.get_local_id(2); +static void norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel, + const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) { + + const int nrows = item_ct1.get_group_range(2); + const int nchannels = item_ct1.get_group_range(1); const int nthreads = item_ct1.get_local_range(2); + const int sample = item_ct1.get_group(0); + const int channel = item_ct1.get_group(1); + const int row = item_ct1.get_group(2); + + const int tid = item_ct1.get_local_id(2); const int nwarps = nthreads / WARP_SIZE; + + const auto strided_offset = calculate_offset<3>({stride_sample, stride_channel, stride_row}, {sample, channel, row}); + const auto packed_offset = calculate_offset<3>({nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row}); + + x += strided_offset; + dst += packed_offset; + sycl::float2 mean_var = sycl::float2(0.f, 0.f); for (int col = tid; col < ncols; col += block_size) { - const float xi = x[row * ncols + col]; + const float xi = x[col]; mean_var.x() += xi; mean_var.y() += xi * xi; } // sum up partial sums mean_var = warp_reduce_sum(mean_var, item_ct1); - if (block_size > WARP_SIZE) { - - int warp_id = item_ct1.get_local_id(2) / WARP_SIZE; - int lane_id = item_ct1.get_local_id(2) % WARP_SIZE; - if (lane_id == 0) { - s_sum[warp_id] = mean_var; + if (block_size > WARP_SIZE) { + const auto sub_group = item_ct1.get_sub_group(); + const auto sg_id = sub_group.get_group_linear_id(); + const auto wi_in_sg = sub_group.get_local_linear_id(); + if (wi_in_sg == 0) { + s_sum[sg_id] = mean_var; } - /* - DPCT1118:0: SYCL group functions and algorithms must be encountered in - converged control flow. You may need to adjust the code. 
- */ item_ct1.barrier(sycl::access::fence_space::local_space); mean_var = 0.f; - size_t nreduce = nwarps / WARP_SIZE; + const size_t nreduce = ceil_div(nwarps, WARP_SIZE); for (size_t i = 0; i < nreduce; i += 1) { - mean_var += s_sum[lane_id + i * WARP_SIZE]; + mean_var += s_sum[wi_in_sg + i * WARP_SIZE]; } mean_var = warp_reduce_sum(mean_var, item_ct1); } @@ -44,7 +54,7 @@ static void norm_f32(const float* x, float* dst, const int ncols, const float ep const float inv_std = sycl::rsqrt(var + eps); for (int col = tid; col < ncols; col += block_size) { - dst[row * ncols + col] = (x[row * ncols + col] - mean) * inv_std; + dst[col] = (x[col] - mean) * inv_std; } } @@ -135,39 +145,51 @@ static void group_norm_f32(const float* x, float* dst, const int group_size, con } } -static void rms_norm_f32(const float* x, float* dst, const int ncols, const float eps, - const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) { - const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + - item_ct1.get_local_id(1); - const int tid = item_ct1.get_local_id(2); +static void rms_norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel, + const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) { + + const int nrows = item_ct1.get_group_range(2); + const int nchannels = item_ct1.get_group_range(1); + + const int sample = item_ct1.get_group(0); + const int channel = item_ct1.get_group(1); + const int row = item_ct1.get_group(2); + const int nthreads = item_ct1.get_local_range(2); + + const int tid = item_ct1.get_local_id(2); const int nwarps = nthreads / WARP_SIZE; + + const auto strided_offset = calculate_offset<3>({stride_sample, stride_channel, stride_row}, {sample, channel, row}); + const auto packed_offset = calculate_offset<3>({nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row}); + + x += strided_offset; + dst += packed_offset; + + float tmp = 0.0f; // partial sum for thread in warp for (int col = tid; col < ncols; col += block_size) { - const float xi = x[row * ncols + col]; + const float xi = x[col]; tmp += xi * xi; } // sum up partial sums tmp = warp_reduce_sum(tmp, item_ct1); if (block_size > WARP_SIZE) { - - int warp_id = item_ct1.get_local_id(2) / WARP_SIZE; - int lane_id = item_ct1.get_local_id(2) % WARP_SIZE; - if (lane_id == 0) { - s_sum[warp_id] = tmp; + const auto sub_group = item_ct1.get_sub_group(); + const auto sg_id = sub_group.get_group_linear_id(); + const auto wi_in_sg = sub_group.get_local_linear_id(); + if (wi_in_sg == 0) { + s_sum[sg_id] = tmp; } - /* - DPCT1118:3: SYCL group functions and algorithms must be encountered in - converged control flow. You may need to adjust the code. 
- */ + item_ct1.barrier(sycl::access::fence_space::local_space); - size_t nreduce = nwarps / WARP_SIZE; + const size_t nreduce = ceil_div(nwarps, WARP_SIZE); tmp = 0.f; for (size_t i = 0; i < nreduce; i += 1) { - tmp += s_sum[lane_id + i * WARP_SIZE]; + tmp += s_sum[wi_in_sg + i * WARP_SIZE]; } tmp = warp_reduce_sum(tmp, item_ct1); } @@ -176,7 +198,7 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const floa const float scale = sycl::rsqrt(mean + eps); for (int col = tid; col < ncols; col += block_size) { - dst[row * ncols + col] = scale * x[row * ncols + col]; + dst[col] = scale * x[col]; } } @@ -224,20 +246,20 @@ static void l2_norm_f32(const float* x, float* dst, const int ncols, const float } } -static void norm_f32_sycl(const float* x, float* dst, const int ncols, - const int nrows, const float eps, - queue_ptr stream, int device) { +static void norm_f32_sycl(const float * x, float * dst, const int ncols, const int nrows, const int nchannels, const int nsamples, + const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, + const float eps, queue_ptr stream, int device) { + + const sycl::range<3> global_dims(nsamples, nchannels, nrows); GGML_ASSERT(ncols % WARP_SIZE == 0); if (ncols < 1024) { const sycl::range<3> block_dims(1, 1, WARP_SIZE); stream->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, - block_dims), + sycl::nd_range<3>(global_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - norm_f32(x, dst, ncols, eps, item_ct1, - nullptr, WARP_SIZE); + norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE); }); }); } @@ -252,15 +274,12 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols, */ stream->submit([&](sycl::handler& cgh) { sycl::local_accessor s_sum_acc_ct1( - sycl::range<1>(work_group_size / WARP_SIZE), cgh); - + sycl::range<1>(work_group_size / WARP_SIZE), cgh); cgh.parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, - block_dims), + sycl::nd_range<3>(global_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - norm_f32(x, dst, ncols, eps, item_ct1, - get_pointer(s_sum_acc_ct1), work_group_size); + norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size); }); }); } @@ -313,21 +332,20 @@ static void group_norm_f32_sycl(const float* x, float* dst, } } -static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, - const int nrows, const float eps, - queue_ptr stream, int device) { +static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const int nrows, const int nchannels, const int nsamples, + const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, queue_ptr stream, int device) { GGML_ASSERT(ncols % WARP_SIZE == 0); // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE); + + const sycl::range<3> global_dims(nsamples, nchannels, nrows); if (ncols < 1024) { const sycl::range<3> block_dims(1, 1, WARP_SIZE); stream->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, - block_dims), + sycl::nd_range<3>(global_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - rms_norm_f32(x, 
dst, ncols, eps, item_ct1, - nullptr, WARP_SIZE); + rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE); }); }); } @@ -344,12 +362,10 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, sycl::local_accessor s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE), cgh); cgh.parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, - block_dims), + sycl::nd_range<3>(global_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - rms_norm_f32(x, dst, ncols, eps, item_ct1, - get_pointer(s_sum_acc_ct1), work_group_size); + rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size); }); }); } @@ -398,12 +414,12 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols, } void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { + const ggml_tensor * src0 = dst->src[0]; GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - const int64_t ne00 = dst->src[0]->ne[0]; - const int64_t nrows = ggml_nrows(dst->src[0]); + GGML_TENSOR_UNARY_OP_LOCALS dpct::queue_ptr main_stream = ctx.stream(); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); @@ -411,8 +427,14 @@ void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { float eps; memcpy(&eps, dst->op_params, sizeof(float)); + GGML_ASSERT(eps >= 0.0f); + const size_t ts0 = ggml_type_size(src0->type); + GGML_ASSERT(nb00 == ts0); + const int64_t s01 = nb01 / ts0; + const int64_t s02 = nb02 / ts0; + const int64_t s03 = nb03 / ts0; - norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device); + norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03, s01, s02, s03, eps, main_stream, ctx.device); } void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { @@ -436,11 +458,10 @@ void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - const int64_t ne00 = dst->src[0]->ne[0]; - const int64_t nrows = ggml_nrows(dst->src[0]); dpct::queue_ptr main_stream = ctx.stream(); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); @@ -450,7 +471,13 @@ void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { float eps; memcpy(&eps, dst->op_params, sizeof(float)); - rms_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device); + GGML_TENSOR_UNARY_OP_LOCALS + const size_t ts0 = ggml_type_size(src0->type); + GGML_ASSERT(nb00 == ts0); + const int64_t s01 = nb01 / ts0; + const int64_t s02 = nb02 / ts0; + const int64_t s03 = nb03 / ts0; + rms_norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03, s01, s02, s03, eps, main_stream, ctx.device); } void ggml_sycl_op_l2_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { diff --git a/ggml/src/ggml-sycl/outprod.cpp b/ggml/src/ggml-sycl/outprod.cpp index b60415784..3a17f3a1b 100644 --- a/ggml/src/ggml-sycl/outprod.cpp +++ b/ggml/src/ggml-sycl/outprod.cpp @@ -1,6 +1,7 @@ #include "outprod.hpp" void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); const ggml_tensor *src0 = 
dst->src[0]; const ggml_tensor *src1 = dst->src[1]; diff --git a/ggml/src/ggml-sycl/rope.cpp b/ggml/src/ggml-sycl/rope.cpp index 4e276d3b6..44473e1e5 100644 --- a/ggml/src/ggml-sycl/rope.cpp +++ b/ggml/src/ggml-sycl/rope.cpp @@ -49,10 +49,7 @@ static void rope_norm(const T * x, T * dst, const int ne0, const int ne1, const if (i0 >= n_dims) { const int i = row * ne0 + i0; - - dst[i + 0] = x[i + 0]; - dst[i + 1] = x[i + 1]; - + *reinterpret_cast *>(dst + i) = *reinterpret_cast *>(x + i); return; } @@ -93,10 +90,7 @@ static void rope_neox(const T * x, T * dst, const int ne0, const int ne1, const if (i0 >= n_dims) { const int i = row * ne0 + i0; - - dst[i + 0] = x[i + 0]; - dst[i + 1] = x[i + 1]; - + *reinterpret_cast *>(dst + i) = *reinterpret_cast *>(x + i); return; } @@ -122,6 +116,63 @@ static void rope_neox(const T * x, T * dst, const int ne0, const int ne1, const dst[i + n_dims / 2] = x0 * sin_theta + x1 * cos_theta; } +template +static void rope_multi(const T * x, T * dst, const int ne0, const int ne1, const int ne2, const size_t s1, + const size_t s2, const int n_dims, const int32_t * pos, const float freq_scale, + const float ext_factor, const float attn_factor, const rope_corr_dims corr_dims, + const float theta_scale, const float * freq_factors, const mrope_sections sections, + const sycl::nd_item<3> & item_ct1) { + // get index pos + const int i0 = 2 * (item_ct1.get_group(1) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1)); + if (i0 >= ne0) { + return; + } + const int row_dst = (item_ct1.get_group(2) * item_ct1.get_local_range(2)) + item_ct1.get_local_id(2); + + if (i0 >= n_dims) { + const int i = row_dst*ne0 + i0; + *reinterpret_cast *>(dst + i) = *reinterpret_cast *>(x + i); + return; + } + + const int row_x = row_dst % ne1; + const int channel_x = row_dst / ne1; + const int idst = (row_dst * ne0) + (i0 / 2); + const size_t ix = ((size_t) channel_x * s2) + ((size_t) row_x * s1) + (i0 / 2); + + const int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3]; + const int sec_w = sections.v[1] + sections.v[0]; + const int sector = (i0 / 2) % sect_dims; + + + float theta_base = 0.0; + if (sector < sections.v[0]) { + theta_base = pos[channel_x]*sycl::pow(theta_scale, i0/2.0f); + } + else if (sector >= sections.v[0] && sector < sec_w) { + theta_base = pos[channel_x + ne2 * 1]*sycl::pow(theta_scale, i0/2.0f); + } + else if (sector >= sec_w && sector < sec_w + sections.v[2]) { + theta_base = pos[channel_x + ne2 * 2]*sycl::pow(theta_scale, i0/2.0f); + } + else if (sector >= sec_w + sections.v[2]) { + theta_base = pos[channel_x + ne2 * 3]*sycl::pow(theta_scale, i0/2.0f); + } + + const float freq_factor = has_ff ? 
freq_factors[i0 / 2] : 1.0f; + float cos_theta; + float sin_theta; + rope_yarn(theta_base / freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); + const float x0 = x[ix + 0]; + const float x1 = x[ix + n_dims/2]; + + // store results in dst + dst[idst + 0] = x0 * cos_theta - x1 * sin_theta; + dst[idst + n_dims/2] = x0 * sin_theta + x1 * cos_theta; +} + + + template static void rope_vision(const T * x, T * dst, const int ne0, const int ne1, const int ne2, const size_t s1, const size_t s2, const int n_dims, const int32_t * pos, const float freq_scale, @@ -171,7 +222,7 @@ static void rope_norm_sycl(const T * x, T * dst, const int ne0, const int ne1, c const float * freq_factors, queue_ptr stream) { GGML_ASSERT(ne0 % 2 == 0); const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1); - const int num_blocks_x = (ne0 + 2 * SYCL_ROPE_BLOCK_SIZE - 1) / (2 * SYCL_ROPE_BLOCK_SIZE); + const int num_blocks_x = ceil_div(ne0, (2 * SYCL_ROPE_BLOCK_SIZE)); const sycl::range<3> block_nums(1, num_blocks_x, nr); const float theta_scale = powf(freq_base, -2.0f / n_dims); @@ -208,7 +259,7 @@ static void rope_neox_sycl(const T * x, T * dst, const int ne0, const int ne1, c const rope_corr_dims corr_dims, const float * freq_factors, queue_ptr stream) { GGML_ASSERT(ne0 % 2 == 0); const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1); - const int num_blocks_x = (ne0 + 2 * SYCL_ROPE_BLOCK_SIZE - 1) / (2 * SYCL_ROPE_BLOCK_SIZE); + const int num_blocks_x = ceil_div(ne0, (2 * SYCL_ROPE_BLOCK_SIZE)); const sycl::range<3> block_nums(1, num_blocks_x, nr); const float theta_scale = powf(freq_base, -2.0f / n_dims); @@ -228,6 +279,40 @@ static void rope_neox_sycl(const T * x, T * dst, const int ne0, const int ne1, c } } +template +static void rope_multi_sycl(const T * x, T * dst, const int ne0, const int ne1, const int ne2, const size_t s1, + const size_t s2, const int n_dims, const int nr, const int32_t * pos, + const float freq_scale, const float freq_base, const float ext_factor, + const float attn_factor, const rope_corr_dims corr_dims, const float * freq_factors, + const mrope_sections sections, queue_ptr stream) { + GGML_ASSERT(ne0 % 2 == 0); + const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1); + const int n_blocks_y = ceil_div(ne0, (2 * SYCL_ROPE_BLOCK_SIZE)); + const sycl::range<3> grid_dims(1, n_blocks_y, nr); + const sycl::nd_range<3> nd_range(grid_dims * block_dims, block_dims); + + const float theta_scale = std::pow(freq_base, -2.0f / n_dims); + // Add FP16 capability check if T could be sycl::half + if constexpr (std::is_same_v) { + dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); + } + // launch kernel + if (freq_factors == nullptr) { + stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) { + rope_multi(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, + corr_dims, theta_scale, freq_factors, sections, item_ct1); + }); + } else { + stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) { + rope_multi(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, + corr_dims, theta_scale, freq_factors, sections, item_ct1); + }); + } +} + + + + // rope vision template static void rope_vision_sycl(const T * x, T * dst, const int ne0, const int ne1, const int ne2, const size_t s1, @@ -237,7 +322,7 @@ static void rope_vision_sycl(const T * x, T * dst, const int ne0, const int ne1, const mrope_sections sections, queue_ptr stream) { GGML_ASSERT(ne0 % 2 == 0); const 
sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1); - const int n_blocks_y = (ne0 + 2 * SYCL_ROPE_BLOCK_SIZE - 1) / (2 * SYCL_ROPE_BLOCK_SIZE); + const int n_blocks_y = ceil_div(ne0, (2 * SYCL_ROPE_BLOCK_SIZE)); const sycl::range<3> grid_dims(1, n_blocks_y, nr); const sycl::nd_range<3> nd_range(grid_dims * block_dims, block_dims); @@ -298,8 +383,17 @@ inline void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst) memcpy(§ions.v, (int32_t *) dst->op_params + 11, sizeof(int)*4); const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; + const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; const bool is_vision = mode == GGML_ROPE_TYPE_VISION; + if (is_mrope) { + GGML_ASSERT(sections.v[0] > 0 || sections.v[1] > 0 || sections.v[2] > 0); + } + + if (is_vision) { + GGML_ASSERT(n_dims == ne00/2); + } + const int32_t * pos = (const int32_t *) dst->src[1]->data; const float * freq_factors = nullptr; @@ -326,6 +420,19 @@ inline void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst) } else { GGML_ABORT("fatal error"); } + } else if (is_mrope && !is_vision) { + GGML_SYCL_DEBUG("%s: mrope path\n", __func__); + if (dst->src[0]->type == GGML_TYPE_F16) { + rope_multi_sycl((const sycl::half *)dst->src[0]->data, (sycl::half *)dst->data, ne00, ne01, ne02, s01, + s02, n_dims, nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, + freq_factors, sections, main_stream); + } else if (dst->src[0]->type == GGML_TYPE_F32) { + rope_multi_sycl((const float *) dst->src[0]->data, (float *) dst->data, ne00, ne01, ne02, s01, s02, n_dims, + nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, + main_stream); + } else { + GGML_ABORT("Fatal error: Tensor type unsupported!"); + } } else if (is_vision) { GGML_SYCL_DEBUG("%s: vision path\n", __func__); if (dst->src[0]->type == GGML_TYPE_F16) { @@ -355,8 +462,7 @@ inline void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst) } void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/3); ggml_sycl_op_rope(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); } diff --git a/ggml/src/ggml-sycl/softmax.cpp b/ggml/src/ggml-sycl/softmax.cpp index 7563d9ced..52fcf4b3d 100644 --- a/ggml/src/ggml-sycl/softmax.cpp +++ b/ggml/src/ggml-sycl/softmax.cpp @@ -225,7 +225,7 @@ static void soft_max_f32_sycl(const float * x, const T * mask, } void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -249,16 +249,13 @@ void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F16) { const sycl::half * src1_dd = static_cast(dst->src[1]->data); - GGML_SYCL_DEBUG("%s: F16 mask\n", __func__); soft_max_f32_sycl(src0_dd, src1_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device); } else if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F32) { const float * src1_dd = static_cast(dst->src[1]->data); - GGML_SYCL_DEBUG("%s: F32 mask\n", __func__); soft_max_f32_sycl(src0_dd, src1_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device); } else { /* mask unavailable */ - GGML_SYCL_DEBUG("%s: No mask\n", __func__); soft_max_f32_sycl(src0_dd, nullptr, dst_dd, ne00, nrows_x, nrows_y, 
scale, max_bias, main_stream, ctx.device); } } diff --git a/ggml/src/ggml-sycl/tsembd.cpp b/ggml/src/ggml-sycl/tsembd.cpp index b877d18c1..f6ca626ea 100644 --- a/ggml/src/ggml-sycl/tsembd.cpp +++ b/ggml/src/ggml-sycl/tsembd.cpp @@ -56,8 +56,8 @@ static void timestep_embedding_f32_sycl( } void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - const ggml_tensor *src0 = dst->src[0]; - const ggml_tensor *src1 = dst->src[1]; + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + const ggml_tensor * src0 = dst->src[0]; const float * src0_d = (const float *)src0->data; float * dst_d = (float *)dst->data; dpct::queue_ptr stream = ctx.stream(); @@ -69,5 +69,4 @@ void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tenso const int max_period = dst->op_params[1]; timestep_embedding_f32_sycl(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream); - GGML_UNUSED(src1); } diff --git a/ggml/src/ggml-sycl/wkv.cpp b/ggml/src/ggml-sycl/wkv.cpp index 540f6fbf5..c10e2f764 100644 --- a/ggml/src/ggml-sycl/wkv.cpp +++ b/ggml/src/ggml-sycl/wkv.cpp @@ -180,10 +180,7 @@ static void rwkv_wkv7_f32_kernel( } void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { - - const ggml_tensor *src0 = dst->src[0]; - const ggml_tensor *src1 = dst->src[1]; - + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/6); const float* k_d = (const float*)dst->src[0]->data; const float* v_d = (const float*)dst->src[1]->data; const float* r_d = (const float*)dst->src[2]->data; @@ -236,16 +233,10 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { }); }); } - - GGML_UNUSED(src0); - GGML_UNUSED(src1); } void ggml_sycl_op_rwkv_wkv7(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { - - const ggml_tensor *src0 = dst->src[0]; - const ggml_tensor *src1 = dst->src[1]; - + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/7); const float* r_d = (const float*)dst->src[0]->data; const float* w_d = (const float*)dst->src[1]->data; const float* k_d = (const float*)dst->src[2]->data; @@ -299,7 +290,4 @@ void ggml_sycl_op_rwkv_wkv7(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { }); }); } - - GGML_UNUSED(src0); - GGML_UNUSED(src1); } diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt index 662f13771..4a88415f9 100644 --- a/ggml/src/ggml-vulkan/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/CMakeLists.txt @@ -109,10 +109,6 @@ if (Vulkan_FOUND) add_compile_definitions(GGML_VULKAN_SHADER_DEBUG_INFO) endif() - if (GGML_VULKAN_PERF) - add_compile_definitions(GGML_VULKAN_PERF) - endif() - if (GGML_VULKAN_VALIDATE) add_compile_definitions(GGML_VULKAN_VALIDATE) endif() diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index c160a9984..41d20aa5d 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1,6 +1,6 @@ #include "ggml-vulkan.h" #include -#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_PERF) || defined(GGML_VULKAN_CHECK_RESULTS) +#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS) #include #include "ggml-cpu.h" #endif @@ -184,9 +184,7 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = { #ifdef GGML_VULKAN_MEMORY_DEBUG class vk_memory_logger; #endif -#ifdef GGML_VULKAN_PERF class vk_perf_logger; -#endif static void ggml_vk_destroy_buffer(vk_buffer& buf); static constexpr uint32_t mul_mat_vec_max_cols = 8; @@ -442,9 
+440,11 @@ struct vk_device_struct { #ifdef GGML_VULKAN_MEMORY_DEBUG std::unique_ptr memory_logger; #endif -#ifdef GGML_VULKAN_PERF + + // for GGML_VK_PERF_LOGGER std::unique_ptr perf_logger; -#endif + vk::QueryPool query_pool; + uint32_t num_queries; ~vk_device_struct() { VK_LOG_DEBUG("destroy device " << name); @@ -828,8 +828,6 @@ private: #define VK_LOG_MEMORY(msg) ((void) 0) #endif // GGML_VULKAN_MEMORY_DEBUG -#if defined(GGML_VULKAN_PERF) - class vk_perf_logger { public: void print_timings() { @@ -839,7 +837,7 @@ public: for (const auto& time : t.second) { total += time; } - std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " ms" << std::endl; + std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us" << std::endl; } timings.clear(); @@ -868,7 +866,6 @@ public: private: std::map> timings; }; -#endif // GGML_VULKAN_PERF struct ggml_backend_vk_context { std::string name; @@ -958,6 +955,8 @@ struct vk_instance_t { static bool vk_instance_initialized = false; static vk_instance_t vk_instance; +static bool vk_perf_logger_enabled = false; + #ifdef GGML_VULKAN_CHECK_RESULTS static size_t vk_skip_checks; static size_t vk_output_tensor; @@ -1653,7 +1652,7 @@ static std::array fa_rows_cols(FaCodePath path, uint32_t D, uint32_ return {64, 32}; } return {64, 64}; -}; +} static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vector& warptile, bool mul_mat_id, ggml_type src0_type) { @@ -2757,9 +2756,9 @@ static vk_device ggml_vk_get_device(size_t idx) { #ifdef GGML_VULKAN_MEMORY_DEBUG device->memory_logger = std::unique_ptr(new vk_memory_logger()); #endif -#ifdef GGML_VULKAN_PERF - device->perf_logger = std::unique_ptr(new vk_perf_logger()); -#endif + if (vk_perf_logger_enabled) { + device->perf_logger = std::unique_ptr(new vk_perf_logger()); + } size_t dev_num = vk_instance.device_indices[idx]; @@ -2804,23 +2803,29 @@ static vk_device ggml_vk_get_device(size_t idx) { pipeline_robustness = true; } else if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) { device->subgroup_size_control = true; +#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) } else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 && !getenv("GGML_VK_DISABLE_COOPMAT")) { device->coopmat_support = true; device->coopmat_m = 0; device->coopmat_n = 0; device->coopmat_k = 0; +#endif +#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) } else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 && !getenv("GGML_VK_DISABLE_COOPMAT2")) { coopmat2_support = true; +#endif #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0 && !getenv("GGML_VK_DISABLE_INTEGER_DOT_PRODUCT")) { device->integer_dot_product = true; #endif +#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) } else if (strcmp("VK_KHR_shader_bfloat16", properties.extensionName) == 0 && !getenv("GGML_VK_DISABLE_BFLOAT16")) { bfloat16_support = true; +#endif } } @@ -3541,6 +3546,8 @@ static void ggml_vk_instance_init() { vk_instance.instance = vk::createInstance(instance_create_info); vk_instance_initialized = true; + vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr; + size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size(); // Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan @@ -4670,6 +4677,19 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const } } + 
if (src->type == to) { + // Copy two or four bytes at a time, depending on block size. + // For quantized types, we scale by block size/type size. But + // this path is also used for bf16->bf16 for example, where the + // type size must be exactly 2 or 4. + GGML_ASSERT(ggml_is_quantized(to) || ggml_type_size(src->type) == 2 || ggml_type_size(src->type) == 4); + if ((ggml_type_size(src->type) % 4) == 0) { + return ctx->device->pipeline_contig_cpy_f32_f32; + } else { + return ctx->device->pipeline_contig_cpy_f16_f16; + } + } + std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl; GGML_ABORT("fatal error"); } @@ -6433,6 +6453,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) { case GGML_OP_ROPE: case GGML_OP_RMS_NORM: case GGML_OP_CONV_2D_DW: + case GGML_OP_IM2COL: return true; default: return false; @@ -6731,7 +6752,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co case GGML_OP_UNARY: case GGML_OP_CONV_2D_DW: { - const uint32_t ne = ggml_nelements(dst); + uint32_t ne = ggml_nelements(dst); + if (op == GGML_OP_CPY && ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) { + // Convert from number of logical elements to 2- or 4-byte units. + ne /= ggml_blck_size(src0->type); + if ((ggml_type_size(src0->type) % 4) == 0) { + ne *= ggml_type_size(src0->type) / 4; + } else { + ne *= ggml_type_size(src0->type) / 2; + } + } if (ne > 262144) { elements = { 512, 512, CEIL_DIV(ne, 262144) }; } else if (ne > 512) { @@ -7281,8 +7311,19 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t dst_type_size = ggml_type_size(dst->type); + uint32_t ne = (uint32_t)ggml_nelements(src0); + if (ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) { + // Convert from number of logical elements to 2- or 4-byte units. + ne /= ggml_blck_size(src0->type); + if ((ggml_type_size(src0->type) % 4) == 0) { + ne *= ggml_type_size(src0->type) / 4; + } else { + ne *= ggml_type_size(src0->type) / 2; + } + } + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, { - (uint32_t)ggml_nelements(src0), + ne, (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, @@ -8845,7 +8886,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod ctx->tensor_ctxs[node_idx] = compute_ctx; -#if defined(GGML_VULKAN_CHECK_RESULTS) || defined(GGML_VULKAN_PERF) +#if defined(GGML_VULKAN_CHECK_RESULTS) // Force context reset on each node so that each tensor ends up in its own context // and can be run and compared to its CPU equivalent separately last_node = true; @@ -9264,8 +9305,7 @@ static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_ try { ptr = ggml_vk_host_malloc(vk_instance.devices[0], size); } catch (vk::SystemError& e) { - std::cerr << "ggml_vulkan: Failed to allocate pinned memory." 
<< std::endl; - std::cerr << "ggml_vulkan: " << e.what() << std::endl; + GGML_LOG_WARN("ggml_vulkan: Failed to allocate pinned memory (%s)\n", e.what()); // fallback to cpu buffer return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); } @@ -9466,6 +9506,29 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg bool first_node_in_batch = true; // true if next node will be first node in a batch int submit_node_idx = 0; // index to first node in a batch + vk_context compute_ctx; + if (vk_perf_logger_enabled) { + // allocate/resize the query pool + if (ctx->device->num_queries < cgraph->n_nodes + 1) { + if (ctx->device->query_pool) { + ctx->device->device.destroyQueryPool(ctx->device->query_pool); + } + VkQueryPoolCreateInfo query_create_info = { VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO }; + query_create_info.queryType = VK_QUERY_TYPE_TIMESTAMP; + query_create_info.queryCount = cgraph->n_nodes + 100; + ctx->device->query_pool = ctx->device->device.createQueryPool(query_create_info); + ctx->device->num_queries = query_create_info.queryCount; + } + + ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1); + + GGML_ASSERT(ctx->compute_ctx.expired()); + compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + ctx->compute_ctx = compute_ctx; + ggml_vk_ctx_begin(ctx->device, compute_ctx); + compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0); + } + // Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution. // Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB // (and scaled down based on model size, so smaller models submit earlier). 
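
The block above replaces the old compile-time GGML_VULKAN_PERF counters with a runtime GGML_VK_PERF_LOGGER path: a timestamp query is written before the first node and after every node, the pool is read back once the fence is signalled, and each per-node tick delta is scaled by the device's timestampPeriod. As an aside, here is a minimal sketch of that timestamp-query pattern (not the patch code; `device`, `cmd` and `props` are assumed to be valid vk::Device, vk::CommandBuffer and vk::PhysicalDeviceProperties handles, and pool reuse and error handling are omitted):

    // write one timestamp before and one after the work to be measured
    vk::QueryPoolCreateInfo info({}, vk::QueryType::eTimestamp, /*queryCount=*/2);
    vk::QueryPool pool = device.createQueryPool(info);
    device.resetQueryPool(pool, 0, 2);

    cmd.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, pool, 0);
    // ... record the dispatches to be timed ...
    cmd.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, pool, 1);

    // after submitting `cmd` and waiting on its fence:
    uint64_t ts[2];
    (void) device.getQueryPoolResults(pool, 0, 2, sizeof(ts), ts, sizeof(uint64_t),
                                      vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);
    // timestamps are in ticks; timestampPeriod converts ticks to nanoseconds
    double ns = double(ts[1] - ts[0]) * props.limits.timestampPeriod;

The values handed to vk_perf_logger::log_timing are exactly such tick deltas scaled to nanoseconds; print_timings then divides by 1000 before printing, which is why its unit string changes from "ms" to "us".
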
@@ -9493,6 +9556,17 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, almost_ready, submit); + if (vk_perf_logger_enabled) { + if (ctx->compute_ctx.expired()) { + compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + ctx->compute_ctx = compute_ctx; + ggml_vk_ctx_begin(ctx->device, compute_ctx); + } else { + compute_ctx = ctx->compute_ctx.lock(); + } + compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, i+1); + } + if (enqueued) { ++submitted_nodes; @@ -9514,9 +9588,27 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg } } -#ifdef GGML_VULKAN_PERF - ctx->device->perf_logger->print_timings(); -#endif + if (vk_perf_logger_enabled) { + // End the command buffer and submit/wait + GGML_ASSERT(!ctx->compute_ctx.expired()); + compute_ctx = ctx->compute_ctx.lock(); + ggml_vk_ctx_end(compute_ctx); + + ggml_vk_submit(compute_ctx, ctx->device->fence); + VK_CHECK(ctx->device->device.waitForFences({ ctx->device->fence }, true, UINT64_MAX), "GGML_VULKAN_PERF waitForFences"); + ctx->device->device.resetFences({ ctx->device->fence }); + + // Get the results and pass them to the logger + std::vector timestamps(cgraph->n_nodes + 1); + ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait); + for (int i = 0; i < cgraph->n_nodes; i++) { + if (!ggml_vk_is_empty(cgraph->nodes[i])) { + ctx->device->perf_logger->log_timing(cgraph->nodes[i], uint64_t((timestamps[i+1] - timestamps[i]) * ctx->device->properties.limits.timestampPeriod)); + } + } + + ctx->device->perf_logger->print_timings(); + } ggml_vk_graph_cleanup(ctx); @@ -9867,6 +9959,15 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { return true; } + + // We can handle copying from a type to the same type if it's + // contiguous (memcpy). We use f16 or f32 shaders to do the copy, + // so the type/block size must be a multiple of 4. 
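
A worked example of the element-count conversion used by this same-type copy path may help (illustrative numbers only, taken from the standard ggml block layouts; the shader is chosen in ggml_vk_get_cpy_pipeline above, a 4-byte copy when the type size is a multiple of 4 and a 2-byte copy otherwise):

    // Contiguous Q4_0 -> Q4_0 copy of 4096 logical elements (illustrative sketch).
    // Q4_0: block size 32, type size 18 bytes; 18 % 4 != 0, so the f16 (2-byte)
    // copy shader is used and the element count is expressed in 2-byte units.
    uint32_t ne = 4096;
    ne /= 32;        // 4096 elements -> 128 blocks
    ne *= 18 / 2;    //  128 blocks   -> 1152 two-byte units for the copy shader

Non-quantized same-type copies (for example bf16 to bf16) skip this conversion and keep the logical element count, which is why the assert in ggml_vk_get_cpy_pipeline only requires the type size to be exactly 2 or 4 in that case.
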
+ if (src0_type == src1_type && + ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op) && + (ggml_type_size(src0_type) % 2) == 0) { + return true; + } return false; } break; case GGML_OP_REPEAT: diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 57d3e39ad..196b7b8f3 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -133,7 +133,7 @@ static void ggml_print_backtrace_symbols(void) { } #endif -static void ggml_print_backtrace(void) { +void ggml_print_backtrace(void) { const char * GGML_NO_BACKTRACE = getenv("GGML_NO_BACKTRACE"); if (GGML_NO_BACKTRACE) { return; @@ -160,6 +160,10 @@ static void ggml_print_backtrace(void) { const int parent_pid = getpid(); const int child_pid = fork(); if (child_pid < 0) { // error +#if defined(__linux__) + close(lock[1]); + close(lock[0]); +#endif return; } else if (child_pid == 0) { // child char attach[32]; @@ -167,6 +171,7 @@ static void ggml_print_backtrace(void) { #if defined(__linux__) close(lock[1]); (void) !read(lock[0], lock, 1); + close(lock[0]); #endif // try gdb execlp("gdb", "gdb", "--batch", @@ -195,7 +200,7 @@ static void ggml_print_backtrace(void) { } } #else -static void ggml_print_backtrace(void) { +void ggml_print_backtrace(void) { // platform not supported } #endif @@ -216,6 +221,8 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) { abort(); } +// ggml_print_backtrace is registered with std::set_terminate by ggml.cpp + // // logging // @@ -2312,6 +2319,26 @@ struct ggml_tensor * ggml_repeat( return result; } +struct ggml_tensor * ggml_repeat_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { + const bool can_repeat = ggml_is_empty(a) || ( + (ne0 % a->ne[0] == 0) && + (ne1 % a->ne[1] == 0) && + (ne2 % a->ne[2] == 0) && + (ne3 % a->ne[3] == 0) + ); + GGML_ASSERT(can_repeat); + + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3); + + result->op = GGML_OP_REPEAT; + result->src[0] = a; + + return result; +} + // ggml_repeat_back struct ggml_tensor * ggml_repeat_back( diff --git a/ggml/src/ggml.cpp b/ggml/src/ggml.cpp new file mode 100644 index 000000000..0d388d455 --- /dev/null +++ b/ggml/src/ggml.cpp @@ -0,0 +1,26 @@ +#include "ggml-impl.h" + +#include +#include + +static std::terminate_handler previous_terminate_handler; + +GGML_NORETURN static void ggml_uncaught_exception() { + ggml_print_backtrace(); + if (previous_terminate_handler) { + previous_terminate_handler(); + } + abort(); // unreachable unless previous_terminate_handler was nullptr +} + +static bool ggml_uncaught_exception_init = []{ + const char * GGML_NO_BACKTRACE = getenv("GGML_NO_BACKTRACE"); + if (GGML_NO_BACKTRACE) { + return false; + } + const auto prev{std::get_terminate()}; + GGML_ASSERT(prev != ggml_uncaught_exception); + previous_terminate_handler = prev; + std::set_terminate(ggml_uncaught_exception); + return true; +}(); diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index 8667a80bd..a0a318a29 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -347,11 +347,28 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par int64_t n_tensors = 0; if (ok && gr.read(ctx->version)) { - if (ctx->version == 1) { + if (ok && ctx->version == 0) { + GGML_LOG_ERROR("%s: bad GGUF version: %" PRIu32 "\n", __func__, ctx->version); + ok = false; + } + + /* + * bit layout is different when reading non-native endian models. + * assuming that the GGUF version is 3, the non-native endian model + * would read it as 0x30000000. 
we can use the AND operation against + * the last 4 hexadecimal digits to check if the model is the same + * endianness as the host system. + */ + if (ok && (ctx->version & 0x0000FFFF) == 0x00000000) { + GGML_LOG_ERROR("%s: failed to load model: this GGUF file version %" PRIu32 " is extremely large, is there a mismatch between the host and model endianness?\n", __func__, ctx->version); + ok = false; + } + + if (ok && ctx->version == 1) { GGML_LOG_ERROR("%s: GGUFv1 is no longer supported, please use a more up-to-date version\n", __func__); ok = false; } - if (ctx->version > GGUF_VERSION) { + if (ok && ctx->version > GGUF_VERSION) { GGML_LOG_ERROR("%s: this GGUF file is version %" PRIu32 " but this software only supports up to version %d\n", __func__, ctx->version, GGUF_VERSION); ok = false; diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 618f87180..ae30129b3 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -177,6 +177,9 @@ class Keys: EMBEDDING_LENGTH = "{arch}.convnext.embedding_length" BLOCK_COUNT = "{arch}.convnext.block_count" + class Classifier: + OUTPUT_LABELS = "{arch}.classifier.output_labels" + class Tokenizer: MODEL = "tokenizer.ggml.model" PRE = "tokenizer.ggml.pre" @@ -219,10 +222,13 @@ class Keys: TYPE = "adapter.type" LORA_ALPHA = "adapter.lora.alpha" - class ClipVision: + class Clip: PROJECTOR_TYPE = "clip.projector_type" HAS_VISION_ENCODER = "clip.has_vision_encoder" + HAS_AUDIO_ENCODER = "clip.has_audio_encoder" HAS_LLAVA_PROJECTOR = "clip.has_llava_projector" + + class ClipVision: IMAGE_SIZE = "clip.vision.image_size" PATCH_SIZE = "clip.vision.patch_size" EMBEDDING_LENGTH = "clip.vision.embedding_length" @@ -243,19 +249,33 @@ class Keys: class Projector: SCALE_FACTOR = "clip.vision.projector.scale_factor" + class ClipAudio: + NUM_MEL_BINS = "clip.audio.num_mel_bins" + EMBEDDING_LENGTH = "clip.audio.embedding_length" + FEED_FORWARD_LENGTH = "clip.audio.feed_forward_length" + PROJECTION_DIM = "clip.audio.projection_dim" + BLOCK_COUNT = "clip.audio.block_count" + + class Attention: + HEAD_COUNT = "clip.audio.attention.head_count" + LAYERNORM_EPS = "clip.audio.attention.layer_norm_epsilon" + + class Projector: + STACK_FACTOR = "clip.audio.projector.stack_factor" + # # recommended mapping of model tensor names for storage in gguf # class GGUFType: - MODEL = "model" - ADAPTER = "adapter" - CLIP_VISION = "clip-vision" + MODEL = "model" + ADAPTER = "adapter" + MMPROJ = "mmproj" # dummy, unused for now class MODEL_ARCH(IntEnum): - CLIP_VISION = auto() # dummy arch for clip.cpp + MMPROJ = auto() # dummy arch for clip.cpp LLAMA = auto() LLAMA4 = auto() DECI = auto() @@ -515,10 +535,28 @@ class MODEL_TENSOR(IntEnum): V_RESMPL_QUERY = auto() # minicpmv V_TOK_EMBD_IMG_BREAK = auto() # pixtral V_MM_PATCH_MERGER = auto() # mistral small 3.1 + # audio (mtmd) + A_ENC_EMBD_POS = auto() + A_ENC_CONV1D = auto() + A_PRE_NORM = auto() + A_POST_NORM = auto() + A_ENC_ATTN_Q = auto() + A_ENC_ATTN_K = auto() + A_ENC_ATTN_V = auto() + A_ENC_INPUT_NORM = auto() + A_ENC_OUTPUT = auto() + A_ENC_OUTPUT_NORM = auto() + A_ENC_FFN_UP = auto() + A_ENC_FFN_GATE = auto() + A_ENC_FFN_DOWN = auto() + A_MMPROJ = auto() + A_MMPROJ_FC = auto() + A_MM_NORM_PRE = auto() + A_MM_NORM_MID = auto() MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { - MODEL_ARCH.CLIP_VISION: "clip", # dummy arch for clip.cpp + MODEL_ARCH.MMPROJ: "clip", # dummy arch for clip.cpp MODEL_ARCH.LLAMA: "llama", MODEL_ARCH.LLAMA4: "llama4", MODEL_ARCH.DECI: "deci", @@ -778,10 +816,28 @@ TENSOR_NAMES: 
dict[MODEL_TENSOR, str] = { MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query", MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: "v.token_embd.img_break", # pixtral MODEL_TENSOR.V_MM_PATCH_MERGER: "mm.patch_merger", # mistral small 3.1 + # audio (mtmd) + MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd", + MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}", + MODEL_TENSOR.A_PRE_NORM: "a.pre_ln", + MODEL_TENSOR.A_POST_NORM: "a.post_ln", + MODEL_TENSOR.A_ENC_ATTN_Q: "a.blk.{bid}.attn_q", + MODEL_TENSOR.A_ENC_ATTN_K: "a.blk.{bid}.attn_k", + MODEL_TENSOR.A_ENC_ATTN_V: "a.blk.{bid}.attn_v", + MODEL_TENSOR.A_ENC_INPUT_NORM: "a.blk.{bid}.ln1", + MODEL_TENSOR.A_ENC_OUTPUT: "a.blk.{bid}.attn_out", + MODEL_TENSOR.A_ENC_OUTPUT_NORM: "a.blk.{bid}.ln2", + MODEL_TENSOR.A_ENC_FFN_UP: "a.blk.{bid}.ffn_up", + MODEL_TENSOR.A_ENC_FFN_GATE: "a.blk.{bid}.ffn_gate", + MODEL_TENSOR.A_ENC_FFN_DOWN: "a.blk.{bid}.ffn_down", + MODEL_TENSOR.A_MMPROJ: "mm.a.mlp.{bid}", + MODEL_TENSOR.A_MMPROJ_FC: "mm.a.fc", + MODEL_TENSOR.A_MM_NORM_PRE: "mm.a.norm_pre", + MODEL_TENSOR.A_MM_NORM_MID: "mm.a.norm_mid", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { - MODEL_ARCH.CLIP_VISION: [ + MODEL_ARCH.MMPROJ: [ MODEL_TENSOR.V_MMPROJ, MODEL_TENSOR.V_MMPROJ_FC, MODEL_TENSOR.V_MMPROJ_MLP, @@ -821,6 +877,24 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.V_RESMPL_QUERY, MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK, MODEL_TENSOR.V_MM_PATCH_MERGER, + # audio + MODEL_TENSOR.A_ENC_EMBD_POS, + MODEL_TENSOR.A_ENC_CONV1D, + MODEL_TENSOR.A_PRE_NORM, + MODEL_TENSOR.A_POST_NORM, + MODEL_TENSOR.A_ENC_ATTN_Q, + MODEL_TENSOR.A_ENC_ATTN_K, + MODEL_TENSOR.A_ENC_ATTN_V, + MODEL_TENSOR.A_ENC_INPUT_NORM, + MODEL_TENSOR.A_ENC_OUTPUT, + MODEL_TENSOR.A_ENC_OUTPUT_NORM, + MODEL_TENSOR.A_ENC_FFN_UP, + MODEL_TENSOR.A_ENC_FFN_GATE, + MODEL_TENSOR.A_ENC_FFN_DOWN, + MODEL_TENSOR.A_MMPROJ, + MODEL_TENSOR.A_MMPROJ_FC, + MODEL_TENSOR.A_MM_NORM_PRE, + MODEL_TENSOR.A_MM_NORM_MID, ], MODEL_ARCH.LLAMA: [ MODEL_TENSOR.TOKEN_EMBD, @@ -964,6 +1038,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.POS_EMBD, MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.ATTN_OUT_NORM, + MODEL_TENSOR.ATTN_QKV, MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, MODEL_TENSOR.ATTN_V, @@ -2200,7 +2275,10 @@ class VisionProjectorType: LLAMA4 = "llama4" QWEN2VL = "qwen2vl_merger" QWEN25VL = "qwen2.5vl_merger" + ULTRAVOX = "ultravox" INTERNVL = "internvl" + QWEN2A = "qwen2a" # audio + QWEN25O = "qwen2.5o" # omni # Items here are (block size, type size) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index ff50d3de3..de6e45ae8 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -49,6 +49,7 @@ class TensorInfo: class GGUFValue: value: Any type: GGUFValueType + sub_type: GGUFValueType | None = None class WriterState(Enum): @@ -238,7 +239,7 @@ class GGUFWriter: for key, val in kv_data.items(): kv_bytes += self._pack_val(key, GGUFValueType.STRING, add_vtype=False) - kv_bytes += self._pack_val(val.value, val.type, add_vtype=True) + kv_bytes += self._pack_val(val.value, val.type, add_vtype=True, sub_type=val.sub_type) fout.write(kv_bytes) @@ -268,11 +269,11 @@ class GGUFWriter: fout.flush() self.state = WriterState.TI_DATA - def add_key_value(self, key: str, val: Any, vtype: GGUFValueType) -> None: + def add_key_value(self, key: str, val: Any, vtype: GGUFValueType, sub_type: GGUFValueType | None = None) -> None: if any(key in kv_data for kv_data in self.kv_data): raise ValueError(f'Duplicated key name {key!r}') - self.kv_data[0][key] = 
GGUFValue(value=val, type=vtype) + self.kv_data[0][key] = GGUFValue(value=val, type=vtype, sub_type=sub_type) def add_uint8(self, key: str, val: int) -> None: self.add_key_value(key,val, GGUFValueType.UINT8) @@ -896,7 +897,7 @@ class GGUFWriter: def add_remove_extra_whitespaces(self, value: bool) -> None: self.add_bool(Keys.Tokenizer.REMOVE_EXTRA_WS, value) - def add_precompiled_charsmap(self, charsmap: Sequence[bytes]) -> None: + def add_precompiled_charsmap(self, charsmap: bytes) -> None: self.add_array(Keys.Tokenizer.PRECOMPILED_CHARSMAP, charsmap) def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None: @@ -936,12 +937,18 @@ class GGUFWriter: # for vision models + def add_clip_has_vision_encoder(self, value: bool) -> None: + self.add_bool(Keys.Clip.HAS_VISION_ENCODER, value) + + def add_clip_has_audio_encoder(self, value: bool) -> None: + self.add_bool(Keys.Clip.HAS_AUDIO_ENCODER, value) + + def add_clip_projector_type(self, value: str) -> None: + self.add_string(Keys.Clip.PROJECTOR_TYPE, value) + def add_vision_projection_dim(self, value: int) -> None: self.add_uint32(Keys.ClipVision.PROJECTION_DIM, value) - def add_vision_has_vision_encoder(self, value: bool) -> None: - self.add_bool(Keys.ClipVision.HAS_VISION_ENCODER, value) - def add_vision_patch_size(self, value: int) -> None: self.add_uint32(Keys.ClipVision.PATCH_SIZE, value) @@ -957,9 +964,6 @@ class GGUFWriter: def add_vision_head_count(self, value: int) -> None: self.add_uint32(Keys.ClipVision.Attention.HEAD_COUNT, value) - def add_vision_projector_type(self, value: str) -> None: - self.add_string(Keys.ClipVision.PROJECTOR_TYPE, value) - def add_vision_attention_layernorm_eps(self, value: float) -> None: self.add_float32(Keys.ClipVision.Attention.LAYERNORM_EPS, value) @@ -987,13 +991,39 @@ class GGUFWriter: def add_vision_n_wa_pattern(self, value: int) -> None: self.add_uint32(Keys.ClipVision.N_WA_PATTERN, value) + # audio models + + def add_audio_projection_dim(self, value: int) -> None: + self.add_uint32(Keys.ClipAudio.PROJECTION_DIM, value) + + def add_audio_embedding_length(self, value: int) -> None: + self.add_uint32(Keys.ClipAudio.EMBEDDING_LENGTH, value) + + def add_audio_feed_forward_length(self, value: int) -> None: + self.add_uint32(Keys.ClipAudio.FEED_FORWARD_LENGTH, value) + + def add_audio_block_count(self, value: int) -> None: + self.add_uint32(Keys.ClipAudio.BLOCK_COUNT, value) + + def add_audio_head_count(self, value: int) -> None: + self.add_uint32(Keys.ClipAudio.Attention.HEAD_COUNT, value) + + def add_audio_attention_layernorm_eps(self, value: float) -> None: + self.add_float32(Keys.ClipAudio.Attention.LAYERNORM_EPS, value) + + def add_audio_num_mel_bins(self, value: int) -> None: + self.add_uint32(Keys.ClipAudio.NUM_MEL_BINS, value) + + def add_audio_stack_factor(self, value: int) -> None: + self.add_uint32(Keys.ClipAudio.Projector.STACK_FACTOR, value) + def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes: pack_prefix = '' if not skip_pack_prefix: pack_prefix = '<' if self.endianess == GGUFEndian.LITTLE else '>' return struct.pack(f'{pack_prefix}{fmt}', value) - def _pack_val(self, val: Any, vtype: GGUFValueType, add_vtype: bool) -> bytes: + def _pack_val(self, val: Any, vtype: GGUFValueType, add_vtype: bool, sub_type: GGUFValueType | None = None) -> bytes: kv_data = bytearray() if add_vtype: @@ -1014,7 +1044,9 @@ class GGUFWriter: if len(val) == 0: raise ValueError("Invalid GGUF metadata array. 
Empty array") - if isinstance(val, bytes): + if sub_type is not None: + ltype = sub_type + elif isinstance(val, bytes): ltype = GGUFValueType.UINT8 else: ltype = GGUFValueType.get_type(val[0]) diff --git a/gguf-py/gguf/scripts/gguf_editor_gui.py b/gguf-py/gguf/scripts/gguf_editor_gui.py index 3d38b5cba..05f4db0f8 100755 --- a/gguf-py/gguf/scripts/gguf_editor_gui.py +++ b/gguf-py/gguf/scripts/gguf_editor_gui.py @@ -1521,19 +1521,21 @@ class GGUFEditorWindow(QMainWindow): continue # Apply changes if any + sub_type = None if field.name in self.metadata_changes: value_type, value = self.metadata_changes[field.name] if value_type == GGUFValueType.ARRAY: # Handle array values - element_type, array_values = value - writer.add_array(field.name, array_values) - else: - writer.add_key_value(field.name, value, value_type) + sub_type, value = value else: # Copy original value value = field.contents() - if value is not None and field.types: - writer.add_key_value(field.name, value, field.types[0]) + value_type = field.types[0] + if value_type == GGUFValueType.ARRAY: + sub_type = field.types[-1] + + if value is not None: + writer.add_key_value(field.name, value, value_type, sub_type=sub_type) # Add new metadata for key, (value_type, value) in self.metadata_changes.items(): @@ -1541,7 +1543,12 @@ class GGUFEditorWindow(QMainWindow): if self.reader.get_field(key) is not None: continue - writer.add_key_value(key, value, value_type) + sub_type = None + if value_type == GGUFValueType.ARRAY: + # Handle array values + sub_type, value = value + + writer.add_key_value(key, value, value_type, sub_type=sub_type) # Add tensors (including data) for tensor in self.reader.tensors: diff --git a/gguf-py/gguf/scripts/gguf_new_metadata.py b/gguf-py/gguf/scripts/gguf_new_metadata.py index 7aff6c925..63f230034 100755 --- a/gguf-py/gguf/scripts/gguf_new_metadata.py +++ b/gguf-py/gguf/scripts/gguf_new_metadata.py @@ -24,6 +24,7 @@ class MetadataDetails(NamedTuple): type: gguf.GGUFValueType value: Any description: str = '' + sub_type: gguf.GGUFValueType | None = None def get_field_data(reader: gguf.GGUFReader, key: str) -> Any: @@ -57,7 +58,9 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new logger.debug(f'Removing {field.name}') continue - old_val = MetadataDetails(field.types[0], field.contents()) + val_type = field.types[0] + sub_type = field.types[-1] if val_type == gguf.GGUFValueType.ARRAY else None + old_val = MetadataDetails(val_type, field.contents(), sub_type=sub_type) val = new_metadata.get(field.name, old_val) if field.name in new_metadata: @@ -67,7 +70,7 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new logger.debug(f'Copying {field.name}') if val.value is not None: - writer.add_key_value(field.name, val.value, val.type) + writer.add_key_value(field.name, val.value, val.type, sub_type=sub_type if val.sub_type is None else val.sub_type) if gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata: logger.debug('Adding chat template(s)') diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index b6eb770d8..93dd1d802 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -157,7 +157,7 @@ class TensorNameMap: "h.{bid}.attn.c_attn", # gpt2 "transformer.h.{bid}.mixer.Wqkv", # phi2 "encoder.layers.{bid}.attn.Wqkv", # nomic-bert - "encoder.layers.{bid}.mixer.Wqkv", # jina-bert-v3 + "encoder.layers.{bid}.mixer.Wqkv", # jina "model.layers.{bid}.self_attn.qkv_proj", # phi3 
"encoder.layers.{bid}.self_attention.query_key_value", # chatglm "transformer.layers.{bid}.attn.qkv_proj", # openelm @@ -169,6 +169,7 @@ class TensorNameMap: "model.layers.{bid}.self_attn.q_proj_no_perm", # llama-custom "layers.{bid}.attention.wq", # llama-pth "encoder.layer.{bid}.attention.self.query", # bert + "transformer.layer.{bid}.attention.q_lin", # distillbert "transformer.h.{bid}.attn.q_proj", # gpt-j "model.layers.layers.{bid}.self_attn.q_proj", # plamo "model.layers.{bid}.attention.wq", # internlm2 @@ -183,6 +184,7 @@ class TensorNameMap: "model.layers.{bid}.self_attn.k_proj_no_perm", # llama-custom "layers.{bid}.attention.wk", # llama-pth "encoder.layer.{bid}.attention.self.key", # bert + "transformer.layer.{bid}.attention.k_lin", # distillbert "transformer.h.{bid}.attn.k_proj", # gpt-j "transformer.h.{bid}.attn.k", # refact "model.layers.layers.{bid}.self_attn.k_proj", # plamo @@ -197,6 +199,7 @@ class TensorNameMap: "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo2 phimoe "layers.{bid}.attention.wv", # llama-pth "encoder.layer.{bid}.attention.self.value", # bert + "transformer.layer.{bid}.attention.v_lin", # distillbert "transformer.h.{bid}.attn.v_proj", # gpt-j "transformer.h.{bid}.attn.v", # refact "model.layers.layers.{bid}.self_attn.v_proj", # plamo @@ -217,6 +220,7 @@ class TensorNameMap: "model.layers.{bid}.self_attn.linear_attn", # deci "layers.{bid}.attention.wo", # llama-pth "encoder.layer.{bid}.attention.output.dense", # bert + "transformer.layer.{bid}.attention.out_lin", # distillbert "transformer.h.{bid}.attn.out_proj", # gpt-j "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon "model.layers.{bid}.self_attn.dense", # persimmon @@ -225,7 +229,7 @@ class TensorNameMap: "model.layers.layers.{bid}.self_attn.o_proj", # plamo "model.layers.{bid}.attention.wo", # internlm2 "encoder.layers.{bid}.attn.out_proj", # nomic-bert - "encoder.layers.{bid}.mixer.out_proj", # jina-bert-v3 + "encoder.layers.{bid}.mixer.out_proj", # jina "transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx "encoder.layers.{bid}.self_attention.dense", # chatglm @@ -237,6 +241,7 @@ class TensorNameMap: # Attention output norm MODEL_TENSOR.ATTN_OUT_NORM: ( "encoder.layer.{bid}.attention.output.LayerNorm", # bert + "transformer.layer.{bid}.sa_layer_norm", # distillbert "encoder.layers.{bid}.norm1", # nomic-bert "transformer.decoder_layer.{bid}.rms_norm_1", # Grok "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx @@ -313,6 +318,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo2 "layers.{bid}.feed_forward.w3", # llama-pth "encoder.layer.{bid}.intermediate.dense", # bert + "transformer.layer.{bid}.ffn.lin1", # distillbert "transformer.h.{bid}.mlp.fc_in", # gpt-j "transformer.h.{bid}.mlp.linear_3", # refact "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon @@ -396,6 +402,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo2 "layers.{bid}.feed_forward.w2", # llama-pth "encoder.layer.{bid}.output.dense", # bert + "transformer.layer.{bid}.ffn.lin2", # distillbert "transformer.h.{bid}.mlp.fc_out", # gpt-j "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon "model.layers.{bid}.mlp.dense_4h_to_h", # persimmon @@ -457,6 +464,7 @@ class TensorNameMap: MODEL_TENSOR.LAYER_OUT_NORM: ( "encoder.layer.{bid}.output.LayerNorm", # bert + "transformer.layer.{bid}.output_layer_norm", # 
distillbert "encoder.layers.{bid}.norm2", # nomic-bert "transformer.decoder_layer.{bid}.rms_norm_3", # Grok "encoder.layer.{bid}.mlp.layernorm", # jina-bert-v2 @@ -827,6 +835,7 @@ class TensorNameMap: MODEL_TENSOR.CLS: ( "classifier", # jina "classifier.dense", # roberta + "pre_classifier", # distillbert ), MODEL_TENSOR.CLS_OUT: ( @@ -904,7 +913,6 @@ class TensorNameMap: MODEL_TENSOR.V_MMPROJ_FC: ( "model.connector.modality_projection.proj", # SmolVLM - "multi_modal_projector.linear_1", # llama 4 ), MODEL_TENSOR.V_MMPROJ_MLP: ( @@ -1112,6 +1120,77 @@ class TensorNameMap: MODEL_TENSOR.V_MM_PATCH_MERGER: ( "multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1 ), + + # audio (mtmd) + + MODEL_TENSOR.A_ENC_EMBD_POS: ( + "audio_tower.embed_positions", # ultravox + ), + + MODEL_TENSOR.A_ENC_CONV1D: ( + "audio_tower.conv{bid}", # ultravox + ), + + MODEL_TENSOR.A_PRE_NORM: (), + + MODEL_TENSOR.A_POST_NORM: ( + "audio_tower.layer_norm", # ultravox + "audio_tower.ln_post", # qwen2omni + ), + + MODEL_TENSOR.A_ENC_ATTN_Q: ( + "audio_tower.layers.{bid}.self_attn.q_proj", # ultravox + ), + + MODEL_TENSOR.A_ENC_ATTN_K: ( + "audio_tower.layers.{bid}.self_attn.k_proj", # ultravox + ), + + MODEL_TENSOR.A_ENC_ATTN_V: ( + "audio_tower.layers.{bid}.self_attn.v_proj", # ultravox + ), + + MODEL_TENSOR.A_ENC_INPUT_NORM: ( + "audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox + ), + + MODEL_TENSOR.A_ENC_OUTPUT: ( + "audio_tower.layers.{bid}.self_attn.out_proj", # ultravox + ), + + MODEL_TENSOR.A_ENC_OUTPUT_NORM: ( + "audio_tower.layers.{bid}.final_layer_norm", # ultravox + ), + + MODEL_TENSOR.A_ENC_FFN_UP: ( + "audio_tower.layers.{bid}.fc1", # ultravox + ), + + MODEL_TENSOR.A_ENC_FFN_GATE: (), + + MODEL_TENSOR.A_ENC_FFN_DOWN: ( + "audio_tower.layers.{bid}.fc2", # ultravox + ), + + # note: some tensors below has "audio." 
pseudo-prefix, to prevent conflicts with vision tensors + # this prefix is added in the conversion code in modify_tensors() + + MODEL_TENSOR.A_MMPROJ: ( + "audio.multi_modal_projector.linear_{bid}", # ultravox + ), + + MODEL_TENSOR.A_MMPROJ_FC: ( + "audio.multi_modal_projector.linear", # qwen2audio + "audio_tower.proj", # qwen2omni + ), + + MODEL_TENSOR.A_MM_NORM_PRE: ( + "audio.multi_modal_projector.ln_pre", # ultravox + ), + + MODEL_TENSOR.A_MM_NORM_MID: ( + "audio.multi_modal_projector.ln_mid", # ultravox + ), } # architecture-specific block mappings diff --git a/gguf-py/gguf/utility.py b/gguf-py/gguf/utility.py index e5251aef8..00adcbc93 100644 --- a/gguf-py/gguf/utility.py +++ b/gguf-py/gguf/utility.py @@ -231,7 +231,7 @@ class SafetensorRemote: response.raise_for_status() # Get raw byte data - return response.content[:size] + return response.content[slice(size if size > -1 else None)] @classmethod def check_file_exist(cls, url: str) -> bool: diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml index bb9b86ace..f11351cba 100644 --- a/gguf-py/pyproject.toml +++ b/gguf-py/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "gguf" -version = "0.16.3" +version = "0.17.0" description = "Read and write ML models in GGUF for GGML" authors = ["GGML "] packages = [ diff --git a/include/llama.h b/include/llama.h index 52cd7a5a0..da0f652cf 100644 --- a/include/llama.h +++ b/include/llama.h @@ -259,9 +259,9 @@ extern "C" { llama_token * token; float * embd; llama_pos * pos; - int32_t * n_seq_id; - llama_seq_id ** seq_id; - int8_t * logits; // TODO: rename this to "output" + int32_t * n_seq_id; // TODO: remove, should belong to only 1 sequence + llama_seq_id ** seq_id; // TODO: become llama_seq_id * seq_id; + int8_t * logits; // TODO: rename this to "output" } llama_batch; enum llama_model_kv_override_type { @@ -366,6 +366,8 @@ extern "C" { bool no_perf; // measure performance timings bool op_offload; // offload host tensor operations to device bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) + // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases + // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573 }; // model quantization parameters @@ -471,6 +473,7 @@ extern "C" { LLAMA_API int64_t llama_time_us(void); LLAMA_API size_t llama_max_devices(void); + LLAMA_API size_t llama_max_parallel_sequences(void); LLAMA_API bool llama_supports_mmap (void); LLAMA_API bool llama_supports_mlock (void); @@ -501,6 +504,7 @@ extern "C" { LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model); LLAMA_API int32_t llama_model_n_head (const struct llama_model * model); LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model); + LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model); // Get the model's RoPE frequency scaling factor LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model); @@ -611,11 +615,11 @@ extern "C" { // Returns the number of tokens in the KV cache (slow, use only for debug) // If a KV cell has multiple sequences assigned to it, it will be counted multiple times DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx), - "Use llama_kv_self_seq_pos_max() instead"); + "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)"); // Returns the number of used KV cells (i.e. 
have at least one sequence assigned to them) DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx), - "Use llama_kv_self_seq_pos_max() instead"); + "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)"); // Clear the KV cache - both cell info is erased and KV data is zeroed LLAMA_API void llama_kv_self_clear( @@ -651,7 +655,6 @@ extern "C" { // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) // If the KV cache is RoPEd, the KV data is updated accordingly: // - lazily on next llama_decode() - // - explicitly with llama_kv_self_update() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) LLAMA_API void llama_kv_self_seq_add( @@ -664,7 +667,6 @@ extern "C" { // Integer division of the positions by factor of `d > 1` // If the KV cache is RoPEd, the KV data is updated accordingly: // - lazily on next llama_decode() - // - explicitly with llama_kv_self_update() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) LLAMA_API void llama_kv_self_seq_div( @@ -676,12 +678,14 @@ extern "C" { // Returns the smallest position present in the KV cache for the specified sequence // This is typically non-zero only for SWA caches + // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache // Return -1 if the sequence is empty LLAMA_API llama_pos llama_kv_self_seq_pos_min( struct llama_context * ctx, llama_seq_id seq_id); // Returns the largest position present in the KV cache for the specified sequence + // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache // Return -1 if the sequence is empty LLAMA_API llama_pos llama_kv_self_seq_pos_max( struct llama_context * ctx, @@ -690,14 +694,15 @@ extern "C" { // Defragment the KV cache // This will be applied: // - lazily on next llama_decode() - // - explicitly with llama_kv_self_update() - LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx); + LLAMA_API DEPRECATED(void llama_kv_self_defrag(struct llama_context * ctx), + "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'"); // Check if the context supports KV cache shifting LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx); // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) 
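
Together with the llama_kv_self_update deprecation just below, the llama_kv_self_defrag deprecation above means callers no longer drive cache maintenance explicitly. A small before/after sketch of the intended migration (caller-side code, assumed here for illustration; `ctx` and `batch` are an existing context and batch):

    // before: explicit cache maintenance around decoding
    llama_kv_self_defrag(ctx);   // now deprecated: remove
    llama_kv_self_update(ctx);   // now deprecated: remove
    llama_decode(ctx, batch);

    // after: just decode; K-shifts and defragmentation are applied lazily on the
    // next llama_decode(), with defragmentation driven by the 'defrag_thold' setting
    llama_decode(ctx, batch);
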
- LLAMA_API void llama_kv_self_update(struct llama_context * ctx); + LLAMA_API DEPRECATED(void llama_kv_self_update(struct llama_context * ctx), + "simply remove this call, updates are applied lazily on the next llama_decode()"); // // State / sessions diff --git a/models/ggml-vocab-bert-bge.gguf.inp b/models/ggml-vocab-bert-bge.gguf.inp index 9baf7d77a..86b934e40 100644 --- a/models/ggml-vocab-bert-bge.gguf.inp +++ b/models/ggml-vocab-bert-bge.gguf.inp @@ -1,6 +1,6 @@ ied 4 ½ months __ggml_vocab_test__ -Führer +Äpfel __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-bert-bge.gguf.out b/models/ggml-vocab-bert-bge.gguf.out index a62566ce7..b1c49672f 100644 --- a/models/ggml-vocab-bert-bge.gguf.out +++ b/models/ggml-vocab-bert-bge.gguf.out @@ -1,5 +1,5 @@ 29464 2094 1018 1092 2706 - 11865 17875 + 9706 7959 2140 diff --git a/models/ggml-vocab-chameleon.gguf.inp b/models/ggml-vocab-chameleon.gguf.inp deleted file mode 100644 index 9baf7d77a..000000000 --- a/models/ggml-vocab-chameleon.gguf.inp +++ /dev/null @@ -1,112 +0,0 @@ -ied 4 ½ months -__ggml_vocab_test__ -Führer -__ggml_vocab_test__ - -__ggml_vocab_test__ - -__ggml_vocab_test__ - -__ggml_vocab_test__ - -__ggml_vocab_test__ - -__ggml_vocab_test__ - - -__ggml_vocab_test__ - - - -__ggml_vocab_test__ - - - - -__ggml_vocab_test__ - - -__ggml_vocab_test__ -Hello world -__ggml_vocab_test__ - Hello world -__ggml_vocab_test__ -Hello World -__ggml_vocab_test__ - Hello World -__ggml_vocab_test__ - Hello World! -__ggml_vocab_test__ -Hello, world! -__ggml_vocab_test__ - Hello, world! -__ggml_vocab_test__ - this is 🦙.cpp -__ggml_vocab_test__ -w048 7tuijk dsdfhu -__ggml_vocab_test__ -нещо на Български -__ggml_vocab_test__ -កាន់តែពិសេសអាចខលចេញ -__ggml_vocab_test__ -🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) -__ggml_vocab_test__ -Hello -__ggml_vocab_test__ - Hello -__ggml_vocab_test__ - Hello -__ggml_vocab_test__ - Hello -__ggml_vocab_test__ - Hello -__ggml_vocab_test__ - Hello - Hello -__ggml_vocab_test__ - ( -__ggml_vocab_test__ - - = -__ggml_vocab_test__ -' era -__ggml_vocab_test__ -Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ -__ggml_vocab_test__ -!!!!!! -__ggml_vocab_test__ -3 -__ggml_vocab_test__ -33 -__ggml_vocab_test__ -333 -__ggml_vocab_test__ -3333 -__ggml_vocab_test__ -33333 -__ggml_vocab_test__ -333333 -__ggml_vocab_test__ -3333333 -__ggml_vocab_test__ -33333333 -__ggml_vocab_test__ -333333333 -__ggml_vocab_test__ -Cửa Việt -__ggml_vocab_test__ - discards -__ggml_vocab_test__ - - - - - - - - - - - -🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? 
We'Ve a'lL -__ggml_vocab_test__ diff --git a/models/ggml-vocab-chameleon.gguf.out b/models/ggml-vocab-chameleon.gguf.out deleted file mode 100644 index 7c5413fee..000000000 --- a/models/ggml-vocab-chameleon.gguf.out +++ /dev/null @@ -1,46 +0,0 @@ - 17245 16604 16403 16604 33583 18355 - 16421 51153 - - 16604 - 16650 - 16650 16604 - 16581 - 16582 - 16582 16582 - 16582 16582 16582 - 16581 16582 - 31596 17394 - 34926 17394 - 31596 18671 - 34926 18671 - 34926 18671 16384 - 31596 16395 17394 16384 - 34926 16395 17394 16384 - 16811 16704 20410 16483 16631 16397 52854 - 16470 16399 16403 16407 16604 16406 35764 38185 51595 22592 26639 - 29479 23955 17012 20103 25527 27670 17408 19005 21473 24774 - 54254 42231 48084 29409 16617 61889 29409 16608 21954 16628 21954 16499 58445 29409 16607 58445 21954 16479 42231 21954 16611 21954 16607 21954 16633 21954 16611 29409 16607 21954 16615 - 52351 16604 16391 25825 16392 23686 16498 39161 18885 16618 16488 30853 16604 16391 54124 17153 25134 16656 18476 26169 16895 16392 62193 16611 16604 16391 24664 17153 57169 16721 16872 17073 17304 28729 16392 - 31596 - 34926 - 16650 31596 - 16650 34926 - 16696 31596 - 16696 31596 16582 16696 31596 - 16604 16391 - 16582 16604 16412 - 16390 22623 - 31596 16395 16712 16390 16828 16384 17674 16769 16732 23686 16607 16604 16414 24427 16623 41809 16495 28999 36469 45292 30197 16400 16402 16400 16403 16400 16404 16400 43969 65211 16636 - 16384 16384 16384 16384 16384 16384 - 16402 - 16402 16402 - 16402 16402 16402 - 16402 16402 16402 16402 - 16402 16402 16402 16402 16402 - 16402 16402 16402 16402 16402 16402 - 16402 16402 16402 16402 16402 16402 16402 - 16402 16402 16402 16402 16402 16402 16402 16402 - 16402 16402 16402 16402 16402 16402 16402 16402 16402 - 16418 19038 16639 16448 24315 33727 16467 - 18765 17981 - 16582 16604 16582 16582 16604 16582 16582 16582 16604 16581 16604 16581 16581 16604 16581 16582 16650 16582 16650 16604 16582 16696 16582 16696 16604 16582 52351 16604 16391 25825 16392 23686 16498 39161 18885 16618 16488 30853 16604 16391 54124 17153 25134 16656 18476 26169 16895 16392 62193 16611 20410 16483 16631 18885 16483 16631 16604 16402 16604 16402 16402 16604 16402 16402 16402 16604 16402 16402 16402 16402 16604 16402 16402 16402 16402 16402 16604 16402 16402 16402 16402 16402 16402 16604 16402 16402 16402 16402 16402 16402 16402 16604 16402 16402 16402 16402 16402 16402 16402 16402 16604 16402 16397 16402 16604 16402 16397 16397 16402 16604 16402 16397 16397 16397 16402 16604 54254 42231 48084 29409 16617 61889 29409 16608 21954 16628 21954 16499 58445 29409 16607 58445 21954 16479 42231 21954 16611 27683 16607 16604 16414 24427 16623 41809 16495 28999 36469 45292 30197 16400 16402 16400 16403 16400 16404 16400 43969 65211 16636 16604 16396 16396 16396 16396 16396 16396 16412 16412 16412 16412 16412 16412 16412 27268 23955 17012 20103 25527 27670 17408 19005 21473 24774 16604 16390 16390 16390 16390 16390 16390 16447 16447 16447 16447 16447 16447 16447 16385 16385 16385 16385 16397 16397 16397 16397 16397 16397 16384 16384 16384 16384 16384 16384 16414 16414 16414 16414 16414 16414 16687 16390 16690 16992 16604 16390 61797 16733 16390 16466 16986 16395 16604 16390 17879 16732 17811 16414 16604 16390 16428 16804 17811 16687 16390 16683 17190 16728 16395 16604 16390 16419 16732 16945 16991 25251 16414 17119 16390 38127 16641 16390 16459 16427 diff --git a/models/ggml-vocab-command-r.gguf.inp b/models/ggml-vocab-command-r.gguf.inp index 9baf7d77a..86b934e40 100644 --- a/models/ggml-vocab-command-r.gguf.inp 
+++ b/models/ggml-vocab-command-r.gguf.inp @@ -1,6 +1,6 @@ ied 4 ½ months __ggml_vocab_test__ -Führer +Äpfel __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-command-r.gguf.out b/models/ggml-vocab-command-r.gguf.out index 3f6b41888..0e3af72eb 100644 --- a/models/ggml-vocab-command-r.gguf.out +++ b/models/ggml-vocab-command-r.gguf.out @@ -1,5 +1,5 @@ 2536 228 27 228 22957 6983 - 45 193433 + 90711 87 20910 228 1667 diff --git a/models/ggml-vocab-deepseek-coder.gguf.inp b/models/ggml-vocab-deepseek-coder.gguf.inp index 9baf7d77a..86b934e40 100644 --- a/models/ggml-vocab-deepseek-coder.gguf.inp +++ b/models/ggml-vocab-deepseek-coder.gguf.inp @@ -1,6 +1,6 @@ ied 4 ½ months __ggml_vocab_test__ -Führer +Äpfel __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-deepseek-coder.gguf.out b/models/ggml-vocab-deepseek-coder.gguf.out index 52c4111a1..ef6bc5b8a 100644 --- a/models/ggml-vocab-deepseek-coder.gguf.out +++ b/models/ggml-vocab-deepseek-coder.gguf.out @@ -1,5 +1,5 @@ 1050 207 19 207 19192 4217 - 37 32009 71 6247 + 125 213 26862 282 207 243 diff --git a/models/ggml-vocab-deepseek-llm.gguf.inp b/models/ggml-vocab-deepseek-llm.gguf.inp index 9baf7d77a..86b934e40 100644 --- a/models/ggml-vocab-deepseek-llm.gguf.inp +++ b/models/ggml-vocab-deepseek-llm.gguf.inp @@ -1,6 +1,6 @@ ied 4 ½ months __ggml_vocab_test__ -Führer +Äpfel __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-deepseek-llm.gguf.out b/models/ggml-vocab-deepseek-llm.gguf.out index 0191b7a11..f9d49c9af 100644 --- a/models/ggml-vocab-deepseek-llm.gguf.out +++ b/models/ggml-vocab-deepseek-llm.gguf.out @@ -1,5 +1,5 @@ 1052 207 19 207 19109 4223 - 37 100014 71 6245 + 82077 26723 282 207 243 diff --git a/models/ggml-vocab-deepseek-r1-qwen.gguf.inp b/models/ggml-vocab-deepseek-r1-qwen.gguf.inp deleted file mode 100644 index 9baf7d77a..000000000 --- a/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +++ /dev/null @@ -1,112 +0,0 @@ -ied 4 ½ months -__ggml_vocab_test__ -Führer -__ggml_vocab_test__ - -__ggml_vocab_test__ - -__ggml_vocab_test__ - -__ggml_vocab_test__ - -__ggml_vocab_test__ - -__ggml_vocab_test__ - - -__ggml_vocab_test__ - - - -__ggml_vocab_test__ - - - - -__ggml_vocab_test__ - - -__ggml_vocab_test__ -Hello world -__ggml_vocab_test__ - Hello world -__ggml_vocab_test__ -Hello World -__ggml_vocab_test__ - Hello World -__ggml_vocab_test__ - Hello World! -__ggml_vocab_test__ -Hello, world! -__ggml_vocab_test__ - Hello, world! -__ggml_vocab_test__ - this is 🦙.cpp -__ggml_vocab_test__ -w048 7tuijk dsdfhu -__ggml_vocab_test__ -нещо на Български -__ggml_vocab_test__ -កាន់តែពិសេសអាចខលចេញ -__ggml_vocab_test__ -🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) -__ggml_vocab_test__ -Hello -__ggml_vocab_test__ - Hello -__ggml_vocab_test__ - Hello -__ggml_vocab_test__ - Hello -__ggml_vocab_test__ - Hello -__ggml_vocab_test__ - Hello - Hello -__ggml_vocab_test__ - ( -__ggml_vocab_test__ - - = -__ggml_vocab_test__ -' era -__ggml_vocab_test__ -Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ -__ggml_vocab_test__ -!!!!!! 
-__ggml_vocab_test__ -3 -__ggml_vocab_test__ -33 -__ggml_vocab_test__ -333 -__ggml_vocab_test__ -3333 -__ggml_vocab_test__ -33333 -__ggml_vocab_test__ -333333 -__ggml_vocab_test__ -3333333 -__ggml_vocab_test__ -33333333 -__ggml_vocab_test__ -333333333 -__ggml_vocab_test__ -Cửa Việt -__ggml_vocab_test__ - discards -__ggml_vocab_test__ - - - - - - - - - - - -🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL -__ggml_vocab_test__ diff --git a/models/ggml-vocab-deepseek-r1-qwen.gguf.out b/models/ggml-vocab-deepseek-r1-qwen.gguf.out deleted file mode 100644 index 18b4b45cd..000000000 --- a/models/ggml-vocab-deepseek-r1-qwen.gguf.out +++ /dev/null @@ -1,46 +0,0 @@ - 1122 220 19 220 26062 3951 - 37 50753 261 - - 220 - 256 - 262 - 197 - 198 - 271 - 1406 - 1572 - 9707 1879 - 21927 1879 - 9707 4337 - 21927 4337 - 21927 4337 0 - 9707 11 1879 0 - 21927 11 1879 0 - 419 374 11162 99 247 13 10821 - 86 15 19 23 220 22 83 1963 41808 11472 2940 16739 - 78762 14144 1456 13073 63471 33594 3038 133178 79012 - 146394 97529 241 44258 233 146568 44258 224 147603 20879 115 146280 44258 223 146280 147272 97529 227 147805 148301 147270 44258 223 146848 - 145836 320 8252 8 26525 114 378 235 149921 30543 320 35673 99066 97534 8 25521 227 320 3243 42365 429 702 1181 1828 3950 8 - 9707 - 21927 - 220 21927 - 256 21927 - 262 21927 - 262 21927 198 262 21927 - 320 - 198 284 - 6 11385 - 9707 11 379 64848 0 2585 525 498 26525 223 937 104100 18493 22377 99257 16 18 16 19 16 20 16 35727 21216 - 17085 2928 - 18 - 18 18 - 18 18 18 - 18 18 18 18 - 18 18 18 18 18 - 18 18 18 18 18 18 - 18 18 18 18 18 18 18 - 18 18 18 18 18 18 18 18 - 18 18 18 18 18 18 18 18 18 - 34 90063 128324 - 2560 2347 - 198 4710 14731 65497 7847 1572 2303 78672 10947 145836 320 8252 8 26525 114 378 235 149921 30543 320 35673 99066 97534 8 25521 227 11162 99 247 149955 220 18 220 18 18 220 18 18 18 220 18 18 18 18 220 18 18 18 18 18 220 18 18 18 18 18 18 220 18 18 18 18 18 18 18 220 18 18 18 18 18 18 18 18 220 18 13 18 220 18 496 18 220 18 1112 18 220 146394 97529 241 44258 233 146568 44258 224 147603 20879 115 146280 44258 223 146280 147272 97529 227 144534 937 104100 18493 22377 99257 16 18 16 19 16 20 16 35727 21216 55460 53237 18658 14144 1456 13073 63471 33594 3038 133178 79012 3355 4605 4605 13874 13874 73594 3014 3014 28149 17085 2928 26610 7646 358 3003 1012 364 83 813 566 594 1052 11 364 787 498 2704 30 364 44 537 2704 358 3278 1281 432 11 364 35 498 1075 1045 15243 30 1205 6 42612 264 63866 43 diff --git a/models/ggml-vocab-falcon.gguf.inp b/models/ggml-vocab-falcon.gguf.inp index 9baf7d77a..86b934e40 100644 --- a/models/ggml-vocab-falcon.gguf.inp +++ b/models/ggml-vocab-falcon.gguf.inp @@ -1,6 +1,6 @@ ied 4 ½ months __ggml_vocab_test__ -Führer +Äpfel __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-falcon.gguf.out b/models/ggml-vocab-falcon.gguf.out index 64a48d97f..6319de60e 100644 --- a/models/ggml-vocab-falcon.gguf.out +++ b/models/ggml-vocab-falcon.gguf.out @@ -1,5 +1,5 @@ 878 204 31 3068 133 2137 - 28611 132 30042 + 34502 18614 286 204 258 diff --git a/models/ggml-vocab-gpt-2.gguf.inp b/models/ggml-vocab-gpt-2.gguf.inp index 9baf7d77a..86b934e40 100644 --- a/models/ggml-vocab-gpt-2.gguf.inp +++ b/models/ggml-vocab-gpt-2.gguf.inp @@ -1,6 +1,6 @@ ied 4 ½ months 
__ggml_vocab_test__ -Führer +Äpfel __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-gpt-2.gguf.out b/models/ggml-vocab-gpt-2.gguf.out index 17a13bdfc..6464ded3d 100644 --- a/models/ggml-vocab-gpt-2.gguf.out +++ b/models/ggml-vocab-gpt-2.gguf.out @@ -1,5 +1,5 @@ 798 604 25208 1933 - 37 9116 71 11751 + 127 226 79 69 417 220 220 220 diff --git a/models/ggml-vocab-gpt-4o.gguf.inp b/models/ggml-vocab-gpt-4o.gguf.inp deleted file mode 100644 index 9baf7d77a..000000000 --- a/models/ggml-vocab-gpt-4o.gguf.inp +++ /dev/null @@ -1,112 +0,0 @@ -ied 4 ½ months -__ggml_vocab_test__ -Führer -__ggml_vocab_test__ - -__ggml_vocab_test__ - -__ggml_vocab_test__ - -__ggml_vocab_test__ - -__ggml_vocab_test__ - -__ggml_vocab_test__ - - -__ggml_vocab_test__ - - - -__ggml_vocab_test__ - - - - -__ggml_vocab_test__ - - -__ggml_vocab_test__ -Hello world -__ggml_vocab_test__ - Hello world -__ggml_vocab_test__ -Hello World -__ggml_vocab_test__ - Hello World -__ggml_vocab_test__ - Hello World! -__ggml_vocab_test__ -Hello, world! -__ggml_vocab_test__ - Hello, world! -__ggml_vocab_test__ - this is 🦙.cpp -__ggml_vocab_test__ -w048 7tuijk dsdfhu -__ggml_vocab_test__ -нещо на Български -__ggml_vocab_test__ -កាន់តែពិសេសអាចខលចេញ -__ggml_vocab_test__ -🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) -__ggml_vocab_test__ -Hello -__ggml_vocab_test__ - Hello -__ggml_vocab_test__ - Hello -__ggml_vocab_test__ - Hello -__ggml_vocab_test__ - Hello -__ggml_vocab_test__ - Hello - Hello -__ggml_vocab_test__ - ( -__ggml_vocab_test__ - - = -__ggml_vocab_test__ -' era -__ggml_vocab_test__ -Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ -__ggml_vocab_test__ -!!!!!! -__ggml_vocab_test__ -3 -__ggml_vocab_test__ -33 -__ggml_vocab_test__ -333 -__ggml_vocab_test__ -3333 -__ggml_vocab_test__ -33333 -__ggml_vocab_test__ -333333 -__ggml_vocab_test__ -3333333 -__ggml_vocab_test__ -33333333 -__ggml_vocab_test__ -333333333 -__ggml_vocab_test__ -Cửa Việt -__ggml_vocab_test__ - discards -__ggml_vocab_test__ - - - - - - - - - - - -🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? 
We'Ve a'lL -__ggml_vocab_test__ diff --git a/models/ggml-vocab-gpt-4o.gguf.out b/models/ggml-vocab-gpt-4o.gguf.out deleted file mode 100644 index 478df726f..000000000 --- a/models/ggml-vocab-gpt-4o.gguf.out +++ /dev/null @@ -1,46 +0,0 @@ - 1165 220 19 220 27124 5503 - 37 19194 259 - - 220 - 256 - 271 - 197 - 198 - 279 - 2499 - 2775 - 13225 2375 - 32949 2375 - 13225 5922 - 32949 5922 - 32949 5922 0 - 13225 11 2375 0 - 32949 11 2375 0 - 495 382 9552 99 247 13 17159 - 86 45404 220 22 10191 2852 22924 4750 6916 - 3907 53641 1235 185386 8118 - 11400 107516 15867 20804 22851 134178 77431 32010 104312 37984 16329 27751 89335 - 112927 222 350 14559 8 22861 114 2524 64364 104 15148 350 76466 166700 121942 780 8 91349 350 7393 74471 484 853 1617 2316 6602 8 - 13225 - 32949 - 220 32949 - 256 32949 - 271 32949 - 271 32949 198 271 32949 - 350 - 198 314 - 6 6837 - 13225 11 342 70653 0 3253 553 481 22861 223 1423 7522 18165 2178 34058 22369 16412 32999 16 867 8208 - 147475 - 18 - 2546 - 15517 - 15517 18 - 15517 2546 - 15517 15517 - 15517 15517 18 - 15517 15517 2546 - 15517 15517 15517 - 34 60213 53904 - 2960 3098 - 126470 25980 160432 16609 2775 4066 172261 19432 112927 222 350 14559 8 22861 114 2524 64364 104 15148 350 76466 166700 121942 780 8 91349 9552 99 247 4103 99 247 220 18 220 2546 220 15517 220 15517 18 220 15517 2546 220 15517 15517 220 15517 15517 18 220 15517 15517 2546 220 18 13 18 220 18 485 18 220 18 1008 18 44735 107516 15867 20804 22851 134178 77431 32010 104312 156437 1423 7522 18165 2178 34058 22369 16412 32999 16 867 8208 105024 106657 1967 53641 1235 185386 8118 22434 39336 26178 26178 168394 194663 27271 147475 25883 6961 9790 1339 461 83 1280 19016 1354 11 461 1099 481 3239 30 461 44 625 3239 17291 1520 480 11 461 35 481 1299 1236 17966 30 1416 6 27493 261 54602 43 diff --git a/models/ggml-vocab-llama-bpe.gguf.inp b/models/ggml-vocab-llama-bpe.gguf.inp index 9baf7d77a..86b934e40 100644 --- a/models/ggml-vocab-llama-bpe.gguf.inp +++ b/models/ggml-vocab-llama-bpe.gguf.inp @@ -1,6 +1,6 @@ ied 4 ½ months __ggml_vocab_test__ -Führer +Äpfel __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-llama-bpe.gguf.out b/models/ggml-vocab-llama-bpe.gguf.out index 4b35cf93f..a77376625 100644 --- a/models/ggml-vocab-llama-bpe.gguf.out +++ b/models/ggml-vocab-llama-bpe.gguf.out @@ -1,5 +1,5 @@ 1142 220 19 220 27154 4038 - 37 51853 261 + 88075 16276 301 220 256 diff --git a/models/ggml-vocab-llama-spm.gguf.inp b/models/ggml-vocab-llama-spm.gguf.inp index 9baf7d77a..86b934e40 100644 --- a/models/ggml-vocab-llama-spm.gguf.inp +++ b/models/ggml-vocab-llama-spm.gguf.inp @@ -1,6 +1,6 @@ ied 4 ½ months __ggml_vocab_test__ -Führer +Äpfel __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-llama-spm.gguf.out b/models/ggml-vocab-llama-spm.gguf.out index 93aacf8ba..2a71a6ef8 100644 --- a/models/ggml-vocab-llama-spm.gguf.out +++ b/models/ggml-vocab-llama-spm.gguf.out @@ -1,5 +1,5 @@ 474 287 29871 29946 29871 30226 7378 - 383 4000 261 + 11585 7810 295 259 1678 diff --git a/models/ggml-vocab-llama4.gguf.inp b/models/ggml-vocab-llama4.gguf.inp deleted file mode 100644 index 9baf7d77a..000000000 --- a/models/ggml-vocab-llama4.gguf.inp +++ /dev/null @@ -1,112 +0,0 @@ -ied 4 ½ months -__ggml_vocab_test__ -Führer -__ggml_vocab_test__ - -__ggml_vocab_test__ - -__ggml_vocab_test__ - -__ggml_vocab_test__ - -__ggml_vocab_test__ - -__ggml_vocab_test__ - - -__ggml_vocab_test__ - - - -__ggml_vocab_test__ - - - - -__ggml_vocab_test__ - - -__ggml_vocab_test__ -Hello world 
-__ggml_vocab_test__ - Hello world -__ggml_vocab_test__ -Hello World -__ggml_vocab_test__ - Hello World -__ggml_vocab_test__ - Hello World! -__ggml_vocab_test__ -Hello, world! -__ggml_vocab_test__ - Hello, world! -__ggml_vocab_test__ - this is 🦙.cpp -__ggml_vocab_test__ -w048 7tuijk dsdfhu -__ggml_vocab_test__ -нещо на Български -__ggml_vocab_test__ -កាន់តែពិសេសអាចខលចេញ -__ggml_vocab_test__ -🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) -__ggml_vocab_test__ -Hello -__ggml_vocab_test__ - Hello -__ggml_vocab_test__ - Hello -__ggml_vocab_test__ - Hello -__ggml_vocab_test__ - Hello -__ggml_vocab_test__ - Hello - Hello -__ggml_vocab_test__ - ( -__ggml_vocab_test__ - - = -__ggml_vocab_test__ -' era -__ggml_vocab_test__ -Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ -__ggml_vocab_test__ -!!!!!! -__ggml_vocab_test__ -3 -__ggml_vocab_test__ -33 -__ggml_vocab_test__ -333 -__ggml_vocab_test__ -3333 -__ggml_vocab_test__ -33333 -__ggml_vocab_test__ -333333 -__ggml_vocab_test__ -3333333 -__ggml_vocab_test__ -33333333 -__ggml_vocab_test__ -333333333 -__ggml_vocab_test__ -Cửa Việt -__ggml_vocab_test__ - discards -__ggml_vocab_test__ - - - - - - - - - - - -🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL -__ggml_vocab_test__ diff --git a/models/ggml-vocab-llama4.gguf.out b/models/ggml-vocab-llama4.gguf.out deleted file mode 100644 index 7ca46ce59..000000000 --- a/models/ggml-vocab-llama4.gguf.out +++ /dev/null @@ -1,46 +0,0 @@ - 1190 220 32 220 18215 7112 - 50 16800 258 - - 220 - 256 - 277 - 197 - 198 - 368 - 2946 - 3271 - 19873 3817 - 39715 3817 - 19873 7353 - 39715 7353 - 39715 7353 13 - 19873 24 3817 13 - 39715 24 3817 13 - 544 373 9522 112 247 26 36315 - 99 39923 220 35 9607 21498 21470 3679 9433 - 1595 7653 633 79829 34051 1636 - 8755 102595 115960 21125 148305 96819 102816 39048 14105 22528 160234 - 114590 222 330 14879 21 51358 127 12817 93293 117 24204 330 68239 881 120327 170428 21 89101 330 7384 88230 511 947 1492 3742 7233 21 - 19873 - 39715 - 220 39715 - 256 39715 - 277 39715 - 277 39715 198 277 39715 - 330 - 198 319 - 19 7359 - 19873 24 386 87799 13 2403 583 650 51358 223 1663 155736 1522 42056 7544 13336 28785 29 4412 20645 - 17931 4959 - 31 - 1922 - 12325 - 12325 31 - 12325 1922 - 12325 12325 - 12325 12325 31 - 12325 12325 1922 - 12325 12325 12325 - 47 19811 12077 - 3260 3579 - 198 7283 51499 191231 20192 3271 3322 9287 2143 17860 114590 222 330 14879 21 51358 127 12817 93293 117 24204 330 68239 881 120327 170428 21 89101 9522 112 247 172394 247 220 31 220 1922 220 12325 220 12325 31 220 12325 1922 220 12325 12325 220 12325 12325 31 220 12325 12325 1922 220 31 26 31 220 31 396 31 220 31 1043 31 117131 102595 115960 21125 148305 96819 102816 80883 223 1663 155736 1522 42056 7544 13336 28785 29 4412 20645 79745 150278 117079 633 79829 34051 1636 25611 41990 109428 1488 91054 24072 17931 4959 29795 9296 16517 1806 481 96 1386 36633 1609 24 481 1109 650 5074 43 481 57 702 5074 27088 2170 536 24 481 48 650 1933 1696 30262 43 1665 19 32818 262 27236 56 diff --git a/models/ggml-vocab-mpt.gguf.inp b/models/ggml-vocab-mpt.gguf.inp index 9baf7d77a..86b934e40 100644 --- a/models/ggml-vocab-mpt.gguf.inp +++ b/models/ggml-vocab-mpt.gguf.inp @@ -1,6 +1,6 @@ ied 4 ½ months __ggml_vocab_test__ -Führer 
+Äpfel __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-mpt.gguf.out b/models/ggml-vocab-mpt.gguf.out index 372c751bf..ca62669ad 100644 --- a/models/ggml-vocab-mpt.gguf.out +++ b/models/ggml-vocab-mpt.gguf.out @@ -1,5 +1,5 @@ 728 577 24142 2607 - 39 26288 6554 + 37515 18569 293 209 50276 diff --git a/models/ggml-vocab-nomic-bert-moe.gguf b/models/ggml-vocab-nomic-bert-moe.gguf new file mode 100644 index 000000000..b6f4d9441 Binary files /dev/null and b/models/ggml-vocab-nomic-bert-moe.gguf differ diff --git a/models/ggml-vocab-phi-3.gguf.inp b/models/ggml-vocab-phi-3.gguf.inp index 9baf7d77a..86b934e40 100644 --- a/models/ggml-vocab-phi-3.gguf.inp +++ b/models/ggml-vocab-phi-3.gguf.inp @@ -1,6 +1,6 @@ ied 4 ½ months __ggml_vocab_test__ -Führer +Äpfel __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-phi-3.gguf.out b/models/ggml-vocab-phi-3.gguf.out index 93aacf8ba..2a71a6ef8 100644 --- a/models/ggml-vocab-phi-3.gguf.out +++ b/models/ggml-vocab-phi-3.gguf.out @@ -1,5 +1,5 @@ 474 287 29871 29946 29871 30226 7378 - 383 4000 261 + 11585 7810 295 259 1678 diff --git a/models/ggml-vocab-pixtral.gguf.inp b/models/ggml-vocab-pixtral.gguf.inp deleted file mode 100644 index 9baf7d77a..000000000 --- a/models/ggml-vocab-pixtral.gguf.inp +++ /dev/null @@ -1,112 +0,0 @@ -ied 4 ½ months -__ggml_vocab_test__ -Führer -__ggml_vocab_test__ - -__ggml_vocab_test__ - -__ggml_vocab_test__ - -__ggml_vocab_test__ - -__ggml_vocab_test__ - -__ggml_vocab_test__ - - -__ggml_vocab_test__ - - - -__ggml_vocab_test__ - - - - -__ggml_vocab_test__ - - -__ggml_vocab_test__ -Hello world -__ggml_vocab_test__ - Hello world -__ggml_vocab_test__ -Hello World -__ggml_vocab_test__ - Hello World -__ggml_vocab_test__ - Hello World! -__ggml_vocab_test__ -Hello, world! -__ggml_vocab_test__ - Hello, world! -__ggml_vocab_test__ - this is 🦙.cpp -__ggml_vocab_test__ -w048 7tuijk dsdfhu -__ggml_vocab_test__ -нещо на Български -__ggml_vocab_test__ -កាន់តែពិសេសអាចខលចេញ -__ggml_vocab_test__ -🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) -__ggml_vocab_test__ -Hello -__ggml_vocab_test__ - Hello -__ggml_vocab_test__ - Hello -__ggml_vocab_test__ - Hello -__ggml_vocab_test__ - Hello -__ggml_vocab_test__ - Hello - Hello -__ggml_vocab_test__ - ( -__ggml_vocab_test__ - - = -__ggml_vocab_test__ -' era -__ggml_vocab_test__ -Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ -__ggml_vocab_test__ -!!!!!! -__ggml_vocab_test__ -3 -__ggml_vocab_test__ -33 -__ggml_vocab_test__ -333 -__ggml_vocab_test__ -3333 -__ggml_vocab_test__ -33333 -__ggml_vocab_test__ -333333 -__ggml_vocab_test__ -3333333 -__ggml_vocab_test__ -33333333 -__ggml_vocab_test__ -333333333 -__ggml_vocab_test__ -Cửa Việt -__ggml_vocab_test__ - discards -__ggml_vocab_test__ - - - - - - - - - - - -🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? 
We'Ve a'lL -__ggml_vocab_test__ diff --git a/models/ggml-vocab-pixtral.gguf.out b/models/ggml-vocab-pixtral.gguf.out deleted file mode 100644 index 53309d1bc..000000000 --- a/models/ggml-vocab-pixtral.gguf.out +++ /dev/null @@ -1,46 +0,0 @@ - 2014 1032 1052 1032 28504 6972 - 1070 7088 1258 - - 1032 - 1256 - 1293 - 1009 - 1010 - 1267 - 4688 - 1009 1010 - 22177 4304 - 45383 4304 - 22177 5325 - 45383 5325 - 45383 5325 1033 - 22177 1044 4304 1033 - 45383 1044 4304 1033 - 1593 1395 119685 1166 1153 1046 51228 - 1119 1048 1052 1056 1032 1055 17391 23216 30203 7785 17279 - 3337 30757 1902 4200 63073 3671 - 1225 1158 1128 1225 1158 1182 1225 1158 1147 1225 1159 1139 1225 1158 1143 1225 1159 1130 1225 1158 1150 1225 1158 1183 1225 1158 1159 1225 21359 1225 1158 1159 1225 1158 1162 1225 1158 1182 1225 1158 1133 1225 1158 1129 1225 1158 1155 1225 1158 1133 1225 21359 1225 1158 1137 - 1240 1159 1154 1128 1319 13052 1041 119685 1152 1182 29568 1240 1159 1140 1171 1239 1184 1143 1319 88181 1873 3659 1275 56421 1621 1041 126241 1133 1319 11234 1873 26303 1455 1934 2246 3754 10835 1041 - 22177 - 45383 - 1032 45383 - 1256 45383 - 1293 45383 - 1293 45383 1010 1293 45383 - 1319 - 1010 1376 - 1039 4033 - 22177 1044 1404 48054 1033 3075 1584 1636 119685 1152 1129 3082 26060 2998 63614 82278 1049 1051 1049 1052 1049 1053 1049 6434 6749 - 7290 7290 7290 - 1051 - 1051 1051 - 1051 1051 1051 - 1051 1051 1051 1051 - 1051 1051 1051 1051 1051 - 1051 1051 1051 1051 1051 1051 - 1051 1051 1051 1051 1051 1051 1051 - 1051 1051 1051 1051 1051 1051 1051 1051 - 1051 1051 1051 1051 1051 1051 1051 1051 1051 - 1067 59503 28783 - 3724 4058 - 1010 1032 1267 1032 4688 1032 17152 1458 29356 1010 1256 1010 1293 1010 1260 1010 1652 1010 1240 1159 1154 1128 1319 13052 1041 119685 1152 1182 29568 1240 1159 1140 1171 1239 1184 1143 1319 88181 1873 3659 1275 56421 1621 1041 126241 1133 119685 1166 1153 1240 1159 1166 1153 1032 1051 1032 1051 1051 1032 1051 1051 1051 1032 1051 1051 1051 1051 1032 1051 1051 1051 1051 1051 1032 1051 1051 1051 1051 1051 1051 1032 1051 1051 1051 1051 1051 1051 1051 1032 1051 1051 1051 1051 1051 1051 1051 1051 1032 1051 1046 1051 1032 1051 1791 1051 1032 1051 2880 1051 71881 1158 1128 1225 1158 1182 1225 1158 1147 1225 1159 1139 1225 1158 1143 1225 1159 1130 1225 1158 1150 1225 1158 1183 1225 1158 1159 1225 21359 1225 1158 1159 1225 1158 1162 1225 1158 1182 1225 1158 1133 1240 1159 1152 1129 3082 26060 2998 63614 82278 1049 1051 1049 1052 1049 1053 1049 6434 6749 45577 1045 6626 43555 2843 30757 1902 4200 63073 3671 14931 20040 20040 1657 1657 1975 14135 14135 83923 7290 7290 7290 45509 45509 45509 1362 6483 2151 1576 1116 2189 1514 1681 2156 1044 1576 3609 1636 5257 1063 1576 1077 1605 5257 1362 7534 3180 1494 1044 1576 1068 1636 2479 2269 26883 1063 2837 1039 45654 1261 54297 1076 diff --git a/models/ggml-vocab-qwen2.gguf.inp b/models/ggml-vocab-qwen2.gguf.inp index 9baf7d77a..86b934e40 100644 --- a/models/ggml-vocab-qwen2.gguf.inp +++ b/models/ggml-vocab-qwen2.gguf.inp @@ -1,6 +1,6 @@ ied 4 ½ months __ggml_vocab_test__ -Führer +Äpfel __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-qwen2.gguf.out b/models/ggml-vocab-qwen2.gguf.out index 18b4b45cd..595d59a44 100644 --- a/models/ggml-vocab-qwen2.gguf.out +++ b/models/ggml-vocab-qwen2.gguf.out @@ -1,5 +1,5 @@ 1122 220 19 220 26062 3951 - 37 50753 261 + 86975 15897 301 220 256 diff --git a/models/ggml-vocab-refact.gguf.inp b/models/ggml-vocab-refact.gguf.inp index 9baf7d77a..86b934e40 100644 --- a/models/ggml-vocab-refact.gguf.inp +++ 
b/models/ggml-vocab-refact.gguf.inp @@ -1,6 +1,6 @@ ied 4 ½ months __ggml_vocab_test__ -Führer +Äpfel __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-refact.gguf.out b/models/ggml-vocab-refact.gguf.out index 63d8305c3..f13dda52c 100644 --- a/models/ggml-vocab-refact.gguf.out +++ b/models/ggml-vocab-refact.gguf.out @@ -1,5 +1,5 @@ 4833 225 38 225 143 140 17723 - 56 2006 3935 265 + 144 231 7132 342 225 261 diff --git a/models/ggml-vocab-roberta-bpe.gguf.inp b/models/ggml-vocab-roberta-bpe.gguf.inp deleted file mode 100644 index 9baf7d77a..000000000 --- a/models/ggml-vocab-roberta-bpe.gguf.inp +++ /dev/null @@ -1,112 +0,0 @@ -ied 4 ½ months -__ggml_vocab_test__ -Führer -__ggml_vocab_test__ - -__ggml_vocab_test__ - -__ggml_vocab_test__ - -__ggml_vocab_test__ - -__ggml_vocab_test__ - -__ggml_vocab_test__ - - -__ggml_vocab_test__ - - - -__ggml_vocab_test__ - - - - -__ggml_vocab_test__ - - -__ggml_vocab_test__ -Hello world -__ggml_vocab_test__ - Hello world -__ggml_vocab_test__ -Hello World -__ggml_vocab_test__ - Hello World -__ggml_vocab_test__ - Hello World! -__ggml_vocab_test__ -Hello, world! -__ggml_vocab_test__ - Hello, world! -__ggml_vocab_test__ - this is 🦙.cpp -__ggml_vocab_test__ -w048 7tuijk dsdfhu -__ggml_vocab_test__ -нещо на Български -__ggml_vocab_test__ -កាន់តែពិសេសអាចខលចេញ -__ggml_vocab_test__ -🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) -__ggml_vocab_test__ -Hello -__ggml_vocab_test__ - Hello -__ggml_vocab_test__ - Hello -__ggml_vocab_test__ - Hello -__ggml_vocab_test__ - Hello -__ggml_vocab_test__ - Hello - Hello -__ggml_vocab_test__ - ( -__ggml_vocab_test__ - - = -__ggml_vocab_test__ -' era -__ggml_vocab_test__ -Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ -__ggml_vocab_test__ -!!!!!! -__ggml_vocab_test__ -3 -__ggml_vocab_test__ -33 -__ggml_vocab_test__ -333 -__ggml_vocab_test__ -3333 -__ggml_vocab_test__ -33333 -__ggml_vocab_test__ -333333 -__ggml_vocab_test__ -3333333 -__ggml_vocab_test__ -33333333 -__ggml_vocab_test__ -333333333 -__ggml_vocab_test__ -Cửa Việt -__ggml_vocab_test__ - discards -__ggml_vocab_test__ - - - - - - - - - - - -🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? 
We'Ve a'lL -__ggml_vocab_test__ diff --git a/models/ggml-vocab-roberta-bpe.gguf.out b/models/ggml-vocab-roberta-bpe.gguf.out deleted file mode 100644 index f181ac3dc..000000000 --- a/models/ggml-vocab-roberta-bpe.gguf.out +++ /dev/null @@ -1,46 +0,0 @@ - 2550 204 18430 377 - 597 2768 298 8564 - - 1437 - 1437 1437 - 1437 1437 1437 - 50117 - 50118 - 50140 - 50140 50118 - 50117 50118 - 31414 232 - 20920 232 - 31414 623 - 20920 623 - 20920 623 328 - 31414 6 232 328 - 20920 6 232 328 - 42 16 8103 18164 27 4 49317 - 605 40976 262 10109 18474 385 29 36807 6455 - 36765 25482 22063 23171 34251 18697 10809 26161 18697 3602 22063 27969 40966 25417 15264 26161 24269 36709 41171 35328 - 1376 17772 7471 1376 17772 19002 1376 17772 9085 1376 4333 13859 1376 17772 9357 1376 4333 9264 1376 17772 25448 1376 17772 18400 1376 17772 4333 1376 4333 10172 1376 17772 4333 1376 17772 7258 1376 17772 19002 1376 17772 5782 1376 17772 10172 1376 17772 3726 1376 17772 5782 1376 4333 10172 1376 17772 23171 - 6569 15113 7471 36 21113 43 17841 19002 17 8384 6569 14285 4958 12605 36 34654 2841 4203 354 10146 26511 1070 43 36174 5782 36 8338 21554 14 34 63 308 19233 43 - 31414 - 20920 - 1437 20920 - 1437 1437 20920 - 1437 1437 1437 20920 - 1437 1437 1437 20920 50118 1437 1437 1437 20920 - 36 - 50118 5457 - 108 3567 - 31414 6 1423 108 1250 328 1336 32 47 17841 10172 17487 47876 3602 48617 15264 46537 11423 27326 48494 8210 49233 1558 1570 27761 49429 43251 10809 17772 - 32376 12846 - 246 - 3103 - 25631 - 46152 - 3103 25631 - 46152 3103 - 46152 25631 - 46152 46152 - 46152 3103 25631 - 347 1376 2023 12410 102 16376 1376 2023 6382 90 - 9553 5954 - 50118 1437 50140 1437 50140 50118 1437 50117 1437 50117 50117 1437 50117 50118 1437 1437 50118 1437 1437 1437 50118 1437 1437 1437 1437 50118 1437 1437 1437 1437 1437 50118 6569 15113 7471 36 21113 43 17841 19002 17 8384 6569 14285 4958 12605 36 34654 2841 4203 354 10146 26511 1070 43 36174 5782 8103 18164 27 6569 18164 27 155 2357 30242 155 25631 30242 3103 30242 25631 30242 46152 30242 3103 25631 155 4 246 155 7586 246 155 734 246 25974 17772 7471 1376 17772 19002 1376 17772 9085 1376 4333 13859 1376 17772 9357 1376 4333 9264 1376 17772 25448 1376 17772 18400 1376 17772 4333 1376 4333 10172 1376 17772 4333 1376 17772 7258 1376 17772 19002 1376 17772 5782 18636 10172 17487 47876 3602 48617 15264 46537 11423 27326 48494 8210 49233 1558 1570 27761 49429 43251 10809 17772 36738 48332 47463 18697 10809 25482 22063 23171 34251 18697 10809 26161 18697 3602 22063 27969 40966 25417 15264 26161 24269 36709 41171 35328 128 49690 108 49972 49519 12905 48149 48149 43796 32376 12846 27282 28749 38 348 57 128 41042 37 18 89 6 128 4629 47 686 116 128 448 45 686 38 581 146 24 6 128 495 47 101 103 6845 116 166 108 30660 10 108 462 574 diff --git a/models/ggml-vocab-starcoder.gguf.inp b/models/ggml-vocab-starcoder.gguf.inp index 9baf7d77a..86b934e40 100644 --- a/models/ggml-vocab-starcoder.gguf.inp +++ b/models/ggml-vocab-starcoder.gguf.inp @@ -1,6 +1,6 @@ ied 4 ½ months __ggml_vocab_test__ -Führer +Äpfel __ggml_vocab_test__ __ggml_vocab_test__ diff --git a/models/ggml-vocab-starcoder.gguf.out b/models/ggml-vocab-starcoder.gguf.out index 87e2465d3..4698e2c3c 100644 --- a/models/ggml-vocab-starcoder.gguf.out +++ b/models/ggml-vocab-starcoder.gguf.out @@ -1,5 +1,5 @@ 4850 244 57 244 162 159 17722 - 75 2022 3943 284 + 163 250 7146 361 244 280 diff --git a/models/templates/Qwen-QwQ-32B.jinja b/models/templates/Qwen-QwQ-32B.jinja new file mode 100644 index 000000000..d475f7068 --- /dev/null +++ 
b/models/templates/Qwen-QwQ-32B.jinja @@ -0,0 +1,62 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- '' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" and not message.tool_calls %} + {%- set content = message.content %} + {%- if not loop.last %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- endif %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- if not loop.last %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- endif %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n\n' }} +{%- endif %} diff --git a/models/templates/Qwen-Qwen3-0.6B.jinja b/models/templates/Qwen-Qwen3-0.6B.jinja new file mode 100644 index 000000000..699ff8df4 --- /dev/null +++ b/models/templates/Qwen-Qwen3-0.6B.jinja @@ -0,0 +1,85 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set 
ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/models/templates/README.md b/models/templates/README.md index e4fd104fc..35b6386dd 100644 --- a/models/templates/README.md +++ b/models/templates/README.md @@ -19,4 +19,6 @@ These templates can be updated with the following commands: ./scripts/get_chat_template.py NousResearch/Hermes-2-Pro-Llama-3-8B tool_use > models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja ./scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use > models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja ./scripts/get_chat_template.py Qwen/Qwen2.5-7B-Instruct > models/templates/Qwen-Qwen2.5-7B-Instruct.jinja +./scripts/get_chat_template.py Qwen/QwQ-32B > models/templates/Qwen-QwQ-32B.jinja +./scripts/get_chat_template.py Qwen/Qwen3-0.6B > models/templates/Qwen-Qwen3-0.6B.jinja ``` \ No newline at end of file diff --git a/requirements/requirements-convert_hf_to_gguf.txt b/requirements/requirements-convert_hf_to_gguf.txt index 8cb9c354f..431c596c1 100644 --- a/requirements/requirements-convert_hf_to_gguf.txt +++ b/requirements/requirements-convert_hf_to_gguf.txt @@ -1,3 +1,7 @@ -r ./requirements-convert_legacy_llama.txt --extra-index-url https://download.pytorch.org/whl/cpu -torch~=2.2.1 +torch~=2.2.1; 
platform_machine != "s390x" + +# torch s390x packages can only be found from nightly builds +--extra-index-url https://download.pytorch.org/whl/nightly +torch>=0.0.0.dev0; platform_machine == "s390x" diff --git a/requirements/requirements-convert_hf_to_gguf_update.txt b/requirements/requirements-convert_hf_to_gguf_update.txt index 8cb9c354f..431c596c1 100644 --- a/requirements/requirements-convert_hf_to_gguf_update.txt +++ b/requirements/requirements-convert_hf_to_gguf_update.txt @@ -1,3 +1,7 @@ -r ./requirements-convert_legacy_llama.txt --extra-index-url https://download.pytorch.org/whl/cpu -torch~=2.2.1 +torch~=2.2.1; platform_machine != "s390x" + +# torch s390x packages can only be found from nightly builds +--extra-index-url https://download.pytorch.org/whl/nightly +torch>=0.0.0.dev0; platform_machine == "s390x" diff --git a/requirements/requirements-convert_lora_to_gguf.txt b/requirements/requirements-convert_lora_to_gguf.txt index 5758076c4..d091d5648 100644 --- a/requirements/requirements-convert_lora_to_gguf.txt +++ b/requirements/requirements-convert_lora_to_gguf.txt @@ -1,2 +1,4 @@ -r ./requirements-convert_hf_to_gguf.txt --extra-index-url https://download.pytorch.org/whl/cpu +# torch s390x packages can only be found from nightly builds +--extra-index-url https://download.pytorch.org/whl/nightly diff --git a/requirements/requirements-gguf_editor_gui.txt b/requirements/requirements-gguf_editor_gui.txt index 920dc7cf9..fd253364e 100644 --- a/requirements/requirements-gguf_editor_gui.txt +++ b/requirements/requirements-gguf_editor_gui.txt @@ -1,3 +1,3 @@ numpy~=1.26.4 PySide6~=6.9.0 -gguf>=0.16.0 +gguf>=0.17.0 diff --git a/scripts/compare-commits.sh b/scripts/compare-commits.sh index e40d1cc6d..94a8eceb3 100755 --- a/scripts/compare-commits.sh +++ b/scripts/compare-commits.sh @@ -17,14 +17,14 @@ rm -f llama-bench.sqlite > /dev/null # to test a backend, call the script with the corresponding environment variable (e.g. GGML_CUDA=1 ./scripts/compare-commits.sh ...) if [ -n "$GGML_CUDA" ]; then - cmake_opts="-DGGML_CUDA=ON" + CMAKE_OPTS="${CMAKE_OPTS} -DGGML_CUDA=ON" fi dir="build-bench" function run { rm -fr ${dir} > /dev/null - cmake -B ${dir} -S . $cmake_opts > /dev/null + cmake -B ${dir} -S . 
${CMAKE_OPTS} > /dev/null cmake --build ${dir} -t llama-bench > /dev/null ${dir}/bin/llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite } diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index b141cabc9..aa0fb8fb0 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -7c06c10c532a6cda913c17fc56341e8880ae341d +94a83ba5a725ae2aee79df75dd99b2119d0478cc diff --git a/scripts/sync_vendor.py b/scripts/sync_vendor.py new file mode 100755 index 000000000..1151c9f01 --- /dev/null +++ b/scripts/sync_vendor.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 + +import urllib.request + +vendor = { + "https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp", + "https://github.com/nlohmann/json/releases/latest/download/json_fwd.hpp": "vendor/nlohmann/json_fwd.hpp", + + # sync manually + # "https://raw.githubusercontent.com/ochafik/minja/refs/heads/main/include/minja/minja.hpp": "vendor/minja/minja.hpp", + # "https://raw.githubusercontent.com/ochafik/minja/refs/heads/main/include/minja/chat-template.hpp": "vendor/minja/chat-template.hpp", + + "https://raw.githubusercontent.com/nothings/stb/refs/heads/master/stb_image.h": "vendor/stb/stb_image.h", + + "https://github.com/mackron/miniaudio/raw/refs/tags/0.11.22/miniaudio.h": "vendor/miniaudio/miniaudio.h", + + "https://raw.githubusercontent.com/yhirose/cpp-httplib/refs/tags/v0.20.1/httplib.h": "vendor/cpp-httplib/httplib.h", +} + +for url, filename in vendor.items(): + print(f"downloading {url} to {filename}") # noqa: NP100 + urllib.request.urlretrieve(url, filename) diff --git a/scripts/tool_bench.py b/scripts/tool_bench.py index a2f2a2eb0..d8018e2e2 100755 --- a/scripts/tool_bench.py +++ b/scripts/tool_bench.py @@ -12,6 +12,7 @@ export LLAMA_SERVER_BIN_PATH=$PWD/build/bin/llama-server export LLAMA_CACHE=${LLAMA_CACHE:-$HOME/Library/Caches/llama.cpp} + ./scripts/tool_bench.py run --n 10 --temp -1 --temp 0 --temp 1 --temp 2 --temp 5 --llama-baseline $PWD/buildMaster/bin/llama-server --output qwen14b.jsonl --hf bartowski/Qwen2.5-14B-Instruct-GGUF:Q4_K_L ./scripts/tool_bench.py run --n 30 --temp -1 --temp 0 --temp 1 --model "Qwen 2.5 1.5B Q4_K_M" --output qwen1.5b.jsonl --hf bartowski/Qwen2.5-1.5B-Instruct-GGUF --ollama qwen2.5:1.5b-instruct-q4_K_M ./scripts/tool_bench.py run --n 30 --temp -1 --temp 0 --temp 1 --model "Qwen 2.5 Coder 7B Q4_K_M" --output qwenc7b.jsonl --hf bartowski/Qwen2.5-Coder-7B-Instruct-GGUF --ollama qwen2.5-coder:7b @@ -205,6 +206,7 @@ def run( model: Annotated[Optional[str], typer.Option(help="Name of the model to test (server agnostic)")] = None, hf: Annotated[Optional[str], typer.Option(help="GGUF huggingface model repo id (+ optional quant) to test w/ llama-server")] = None, chat_template: Annotated[Optional[str], typer.Option(help="Chat template override for llama-server")] = None, + chat_template_file: Annotated[Optional[str], typer.Option(help="Chat template file override for llama-server")] = None, ollama: Annotated[Optional[str], typer.Option(help="Ollama model tag to test")] = None, llama_baseline: Annotated[Optional[str], typer.Option(help="llama-server baseline binary path to use as baseline")] = None, n: Annotated[int, typer.Option(help="Number of times to run each test")] = 10, @@ -229,6 +231,12 @@ def run( # n_ctx = 8192 n_ctx = 2048 + if model is None: + if hf is not None: + model = hf.split("/")[-1] + elif ollama is not None: + model = ollama + assert force or append or not output.exists(), f"Output file already exists: {output}; use 
--force to overwrite" with output.open('a' if append else 'w') as output_file: @@ -320,6 +328,7 @@ def run( server.model_hf_repo = hf server.model_hf_file = None server.chat_template = chat_template + server.chat_template_file = chat_template_file server.server_path = server_path if port is not None: server.server_port = port @@ -335,6 +344,7 @@ def run( temp=t, output_kwargs=dict( chat_template=chat_template, + chat_template_file=chat_template_file, ), request_kwargs=dict( ignore_chat_grammar=ignore_chat_grammar, @@ -355,6 +365,7 @@ def run( temp=t, output_kwargs=dict( chat_template=None, + chat_template_file=None, ), request_kwargs=dict( model=ollama, diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d4bf37b1c..d20bd4fe2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -14,12 +14,16 @@ add_library(llama llama-batch.cpp llama-chat.cpp llama-context.cpp + llama-cparams.cpp llama-grammar.cpp llama-graph.cpp llama-hparams.cpp llama-impl.cpp llama-io.cpp llama-kv-cache.cpp + llama-kv-cache-unified.cpp + llama-kv-cache-unified-iswa.cpp + llama-kv-cache-recurrent.cpp llama-memory.cpp llama-mmap.cpp llama-model-loader.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 19f58ba82..0ef81054b 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -175,6 +175,8 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_CONVNEXT_EMBEDDING_LENGTH, "%s.convnext.embedding_length" }, { LLM_KV_CONVNEXT_BLOCK_COUNT, "%s.convnext.block_count" }, + { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" }, + { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, @@ -449,6 +451,7 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_TOKEN_TYPES, "token_types" }, { LLM_TENSOR_POS_EMBD, "position_embd" }, { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index c696b4113..1dcd4fa35 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -214,6 +214,8 @@ enum llm_kv { LLM_KV_CONVNEXT_EMBEDDING_LENGTH, LLM_KV_CONVNEXT_BLOCK_COUNT, + LLM_KV_CLASSIFIER_OUTPUT_LABELS, + // deprecated: LLM_KV_TOKENIZER_PREFIX_ID, LLM_KV_TOKENIZER_SUFFIX_ID, diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp index b98e3256c..6a19a2431 100644 --- a/src/llama-batch.cpp +++ b/src/llama-batch.cpp @@ -15,24 +15,31 @@ llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) { break; } } - ubatch_token.resize(!has_embd ? n_ubatch : 0); - ubatch_embd.resize(has_embd ? n_embd * n_ubatch : 0); - ubatch_pos.resize(n_ubatch); - ubatch_n_seq_id.resize(n_ubatch); - ubatch_seq_id.resize(n_ubatch); - ubatch_output.resize(n_ubatch); + + udatas.push_back({}); + + auto & udata = udatas.back(); + + udata.token.resize(!has_embd ? n_ubatch : 0); + udata.embd.resize(has_embd ? n_embd * n_ubatch : 0); + udata.pos.resize(n_ubatch); + udata.n_seq_id.resize(n_ubatch); + udata.seq_id.resize(n_ubatch); + udata.output.resize(n_ubatch); + llama_ubatch ubatch = { /*equal_seqs =*/ true, /*n_tokens =*/ 0, /*n_seq_tokens =*/ 0, /*n_seqs =*/ 0, - /*token =*/ !has_embd ? ubatch_token.data() : nullptr, - /*embd =*/ has_embd ? 
ubatch_embd.data() : nullptr, - /*pos =*/ ubatch_pos.data(), - /*n_seq_id =*/ ubatch_n_seq_id.data(), - /*seq_id =*/ ubatch_seq_id.data(), - /*output =*/ ubatch_output.data(), + /*token =*/ !has_embd ? udata.token.data() : nullptr, + /*embd =*/ has_embd ? udata.embd.data() : nullptr, + /*pos =*/ udata.pos.data(), + /*n_seq_id =*/ udata.n_seq_id.data(), + /*seq_id =*/ udata.seq_id.data(), + /*output =*/ udata.output.data(), }; + return ubatch; } diff --git a/src/llama-batch.h b/src/llama-batch.h index 6305051b6..b8260b94f 100644 --- a/src/llama-batch.h +++ b/src/llama-batch.h @@ -11,15 +11,15 @@ struct llama_ubatch { bool equal_seqs; // TODO: whole_seqs for embeddings? - uint32_t n_tokens; // total tokens (n_seq_tokens * n_seqs) + uint32_t n_tokens; // total tokens (n_seq_tokens * n_seqs) uint32_t n_seq_tokens; // tokens per sequence uint32_t n_seqs; llama_token * token; // [n_tokens] float * embd; // [n_embd, n_tokens] llama_pos * pos; // [n_tokens] - int32_t * n_seq_id; // [n_seqs] - llama_seq_id ** seq_id; // [n_seqs] + int32_t * n_seq_id; // [n_seqs] // TODO: remove, should belong to only 1 sequence + llama_seq_id ** seq_id; // [n_seqs] // TODO: become llama_seq_id * seq_id; int8_t * output; // [n_tokens] }; @@ -49,13 +49,18 @@ struct llama_sbatch { const llama_batch * batch = nullptr; - // buffers for the ubatch - std::vector ubatch_token; - std::vector ubatch_embd; - std::vector ubatch_pos; - std::vector ubatch_n_seq_id; - std::vector ubatch_seq_id; - std::vector ubatch_output; + // buffers for the ubatches + // TODO: very hacky, this needs a complete rework + struct ubatch_data { + std::vector token; + std::vector embd; + std::vector pos; + std::vector n_seq_id; + std::vector seq_id; + std::vector output; + }; + + std::vector udatas; llama_ubatch reserve_ubatch(size_t n_ubatch, bool has_embd = false); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 85b4324b6..4ab574387 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -6,9 +6,10 @@ #include "llama-model.h" #include "llama-kv-cache.h" -#include -#include #include +#include +#include +#include // // llama_context @@ -25,7 +26,11 @@ llama_context::llama_context( const auto & hparams = model.hparams; - cparams.n_seq_max = std::max(1u, params.n_seq_max); + cparams.n_seq_max = std::max(1u, params.n_seq_max); + if (cparams.n_seq_max > LLAMA_MAX_PARALLEL_SEQUENCES) { + throw std::runtime_error("n_seq_max must be <= " + std::to_string(LLAMA_MAX_PARALLEL_SEQUENCES)); + } + cparams.n_threads = params.n_threads; cparams.n_threads_batch = params.n_threads_batch; cparams.yarn_ext_factor = params.yarn_ext_factor; @@ -118,6 +123,11 @@ llama_context::llama_context( __func__, n_ctx_per_seq, hparams.n_ctx_train); } + if (!params.swa_full && cparams.n_seq_max > 1) { + LLAMA_LOG_WARN("%s: requested n_seq_max (%u) > 1, but swa_full is not enabled -- performance may be degraded: %s\n", + __func__, cparams.n_seq_max, "https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573"); + } + if (!hparams.vocab_only) { // GPU backends for (auto * dev : model.devices) { @@ -255,15 +265,9 @@ llama_context::llama_context( // reserve worst-case graph if (!hparams.vocab_only && memory) { - const uint32_t n_seqs = 1; // TODO: worst-case number of sequences + const uint32_t n_seqs = cparams.n_seq_max; const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - - 
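
The `llama-batch.h` hunk above replaces the single set of reusable `ubatch_*` buffers with one `ubatch_data` entry per reserved ubatch (`udatas`), so the raw pointers inside a previously returned `llama_ubatch` stay valid while later ubatches are reserved. The snippet below is a generic sketch of that ownership pattern with invented names (`TokenView`, `TokenPool`), not the llama.cpp types.

```cpp
// Sketch of the pattern behind llama_sbatch::udatas: every view handed out keeps
// its backing storage alive inside the producer, so reserving a second view no
// longer clobbers the pointers of the first one.
#include <cstdint>
#include <cstdio>
#include <vector>

struct TokenView {
    const int32_t * token; // non-owning, points into the pool
    size_t          n;
};

class TokenPool {
public:
    TokenView reserve(std::vector<int32_t> tokens) {
        // push a fresh buffer instead of reusing one shared buffer; growing the
        // outer vector only moves the inner vectors, so their heap storage (and
        // the pointers handed out) stays in place
        storage.push_back(std::move(tokens));
        const auto & buf = storage.back();
        return { buf.data(), buf.size() };
    }

private:
    std::vector<std::vector<int32_t>> storage; // one entry per reserved view
};

int main() {
    TokenPool pool;
    TokenView a = pool.reserve({1, 2, 3});
    TokenView b = pool.reserve({4, 5}); // does not invalidate a

    std::printf("a[0]=%d b[0]=%d\n", a.token[0], b.token[0]);
}
```
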
// restore later - // TODO: something cleaner - const auto n_outputs_save = n_outputs; - LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs); int n_splits_pp = -1; @@ -275,23 +279,17 @@ llama_context::llama_context( // simulate full KV cache llama_kv_cache * kv_self = static_cast(memory.get()); - kv_self->set_full(); + const auto kv_state = kv_self->init_full(); + if (!kv_state) { + throw std::runtime_error("failed to initialize KV cache"); + } cross.v_embd.clear(); // reserve pp graph first so that buffers are only allocated once { - llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - - // max number of outputs - n_outputs = ubatch_pp.n_tokens; - - LLAMA_LOG_DEBUG("%s: reserving graph for n_tokens = %d, n_seqs = %d\n", __func__, ubatch_pp.n_tokens, ubatch_pp.n_seqs); - - auto * gf = graph_init(); - graph_build(ctx_compute.get(), gf, ubatch_pp, LLM_GRAPH_TYPE_DEFAULT); - - if (!ggml_backend_sched_reserve(sched.get(), gf)) { + auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, kv_state.get()); + if (!gf) { throw std::runtime_error("failed to allocate compute pp buffers"); } @@ -301,16 +299,8 @@ llama_context::llama_context( // reserve with tg graph to get the number of splits and nodes { - llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - - n_outputs = ubatch_tg.n_tokens; - - LLAMA_LOG_DEBUG("%s: reserving graph for n_tokens = %d, n_seqs = %d\n", __func__, ubatch_tg.n_tokens, ubatch_tg.n_seqs); - - auto * gf = graph_init(); - graph_build(ctx_compute.get(), gf, ubatch_tg, LLM_GRAPH_TYPE_DEFAULT); - - if (!ggml_backend_sched_reserve(sched.get(), gf)) { + auto * gf = graph_reserve(1, 1, 1, kv_state.get()); + if (!gf) { throw std::runtime_error("failed to allocate compute tg buffers"); } @@ -320,22 +310,12 @@ llama_context::llama_context( // reserve again with pp graph to avoid ggml-alloc reallocations during inference { - llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - - n_outputs = ubatch_pp.n_tokens; - - LLAMA_LOG_DEBUG("%s: reserving graph for n_tokens = %d, n_seqs = %d\n", __func__, ubatch_pp.n_tokens, ubatch_pp.n_seqs); - - auto * gf = graph_init(); - graph_build(ctx_compute.get(), gf, ubatch_pp, LLM_GRAPH_TYPE_DEFAULT); - - if (!ggml_backend_sched_reserve(sched.get(), gf)) { + auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, kv_state.get()); + if (!gf) { throw std::runtime_error("failed to allocate compute pp buffers"); } } - n_outputs = n_outputs_save; - for (size_t i = 0; i < backend_ptrs.size(); ++i) { ggml_backend_t backend = backend_ptrs[i]; ggml_backend_buffer_type_t buft = backend_buft[i]; @@ -449,36 +429,33 @@ const llama_kv_cache * llama_context::get_kv_self() const { return kv_self; } -void llama_context::kv_self_update() { - bool need_reserve = false; +bool llama_context::kv_self_update() { + if (!memory) { + return false; + } llama_kv_cache * kv_self = static_cast(memory.get()); - need_reserve = kv_self->update(*this); - - // reserve a worst case graph if needed - if (need_reserve) { - LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__); - - // build worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - - // simulate full KV cache - kv_self->set_full(); - - llama_token token = 
model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - - auto * gf = graph_init(); - graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT); - - // initialize scheduler with the worst-case graph - ggml_backend_sched_reset(sched.get()); - if (!ggml_backend_sched_reserve(sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - } + if (!kv_self->update(*this)) { + // no updates have been performed + return false; } + + // if the KV cache did any computation, we have to reserve a new worst-case graph + const auto kv_state = kv_self->init_full(); + if (!kv_state) { + throw std::runtime_error("failed to initialize KV cache"); + } + + const uint32_t n_seqs = cparams.n_seq_max; + const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + + auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, kv_state.get()); + if (!gf) { + LLAMA_LOG_ERROR("%s: failed to reserve graph after the KV cache update\n", __func__); + } + + return true; } enum llama_pooling_type llama_context::pooling_type() const { @@ -672,6 +649,49 @@ bool llama_context::apply_adapter_cvec( return cvec.apply(model, data, len, n_embd, il_start, il_end); } +llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_state_i * mstate, ggml_status & ret) { + if (mstate && !mstate->apply()) { + LLAMA_LOG_ERROR("%s: failed to apply memory state\n", __func__); + ret = GGML_STATUS_FAILED; + return nullptr; + } + + auto * gf = graph_init(); + if (!gf) { + LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__); + ret = GGML_STATUS_FAILED; + return nullptr; + } + + auto res = graph_build(ctx_compute.get(), gf, ubatch, gtype, mstate); + if (!res) { + LLAMA_LOG_ERROR("%s: failed to build graph\n", __func__); + ret = GGML_STATUS_FAILED; + return nullptr; + } + + // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + + if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__); + ret = GGML_STATUS_ALLOC_FAILED; + return nullptr; + } + + res->set_inputs(&ubatch); + + const auto status = graph_compute(gf, ubatch.n_tokens > 1); + if (status != GGML_STATUS_SUCCESS) { + LLAMA_LOG_ERROR("%s: failed to compute graph, compute status: %d\n", __func__, status); + ret = status; + return nullptr; + } + + ret = GGML_STATUS_SUCCESS; + + return res; +} + int llama_context::encode(llama_batch & inp_batch) { if (inp_batch.n_tokens == 0) { LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); @@ -689,12 +709,18 @@ int llama_context::encode(llama_batch & inp_batch) { GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + // TODO: move the validation to the llama_batch_allocr if (batch.token) { for (int32_t i = 0; i < n_tokens; ++i) { if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); return -1; } + + if (batch.seq_id && (batch.seq_id[i][0] < 0 || batch.seq_id[i][0] >= LLAMA_MAX_PARALLEL_SEQUENCES)) { + LLAMA_LOG_ERROR("%s: invalid seq_id[%d] = %d > %d\n", __func__, i, batch.seq_id[i][0], LLAMA_MAX_PARALLEL_SEQUENCES); + throw -1; + } } } @@ -727,8 +753,6 @@ int 
llama_context::encode(llama_batch & inp_batch) { n_outputs = n_tokens; - //batch_manager->prepare(ubatch); - ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); @@ -739,26 +763,18 @@ int llama_context::encode(llama_batch & inp_batch) { // ref: https://github.com/ggml-org/llama.cpp/pull/12181#issuecomment-2730451223 cparams.causal_attn = false; - auto * gf = graph_init(); - auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_ENCODER); - - ggml_backend_sched_alloc_graph(sched.get(), gf); - - res->set_inputs(&ubatch); + ggml_status status; + const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status); cparams.causal_attn = causal_attn_org; - const auto compute_status = graph_compute(gf, n_tokens > 1); - switch (compute_status) { - case GGML_STATUS_SUCCESS: - break; - case GGML_STATUS_ABORTED: - return 2; - case GGML_STATUS_ALLOC_FAILED: - return -2; - case GGML_STATUS_FAILED: - default: - return -3; + if (!res) { + switch (status) { + case GGML_STATUS_ABORTED: return 2; + case GGML_STATUS_ALLOC_FAILED: return -2; + case GGML_STATUS_FAILED: return -3; + case GGML_STATUS_SUCCESS: GGML_ABORT("should not happen"); + } } auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd(); @@ -848,7 +864,7 @@ int llama_context::encode(llama_batch & inp_batch) { int llama_context::decode(llama_batch & inp_batch) { if (!memory) { - LLAMA_LOG_WARN("%s: cannot decode batches with this context (use llama_encode() instead)\n", __func__); + LLAMA_LOG_DEBUG("%s: cannot decode batches with this context (calling encode() instead)\n", __func__); return encode(inp_batch); } @@ -879,15 +895,19 @@ int llama_context::decode(llama_batch & inp_batch) { const int64_t n_tokens_all = batch.n_tokens; const int64_t n_embd = hparams.n_embd; - llama_kv_cache_guard kv_guard(kv_self); - GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + // TODO: move the validation to the llama_batch_allocr if (batch.token) { for (int64_t i = 0; i < n_tokens_all; ++i) { if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]); - throw std::runtime_error("invalid token"); + return -1; + } + + if (batch.seq_id && (batch.seq_id[i][0] < 0 || batch.seq_id[i][0] >= LLAMA_MAX_PARALLEL_SEQUENCES)) { + LLAMA_LOG_ERROR("%s: invalid seq_id[%" PRId64 "] = %d >= %d\n", __func__, i, batch.seq_id[i][0], LLAMA_MAX_PARALLEL_SEQUENCES); + return -1; } } } @@ -920,7 +940,48 @@ int llama_context::decode(llama_batch & inp_batch) { n_outputs_all = 1; } - llama_sbatch sbatch = kv_self->sbatch_init(batch, /* logits_all */ n_outputs_all == n_tokens_all); + // handle any pending defrags/shifts + kv_self_update(); + + llama_memory_state_ptr kv_state; + + bool did_defrag = false; + + while (true) { + kv_state = kv_self->init_batch(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ n_outputs_all == n_tokens_all); + if (!kv_state) { + return -2; + } + + switch (kv_state->get_status()) { + case LLAMA_MEMORY_STATUS_SUCCESS: + { + } break; + case LLAMA_MEMORY_STATUS_FAILED_PREPARE: + { + if (!did_defrag) { + did_defrag = true; + + kv_self->defrag_sched(-1.0f); + if (kv_self_update()) { + LLAMA_LOG_DEBUG("%s: failed to init batch of size %d, retrying after defrag\n", __func__, batch.n_tokens); + + continue; + } + } + + LLAMA_LOG_WARN("%s: failed to find KV cache slot for batch of size %d\n", 
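
With this patch, `encode()` and `decode()` reject tokens whose first `seq_id` falls outside `[0, LLAMA_MAX_PARALLEL_SEQUENCES)` up front instead of failing later. Below is a sketch of filling a `llama_batch` by hand with explicit, in-range sequence ids; the `add_token` helper is invented for illustration (the `common_batch_add` helper in `common/` plays the same role), and context/model setup is omitted.

```cpp
// Sketch: hand-filling a llama_batch with explicit, in-range sequence ids.
// n_seq_max must match the value passed in llama_context_params and stay within
// the per-context sequence cap introduced by this change.
#include "llama.h"

static void add_token(llama_batch & batch, llama_token id, llama_pos pos,
                      llama_seq_id seq_id, bool want_logits) {
    const int i = batch.n_tokens;
    batch.token   [i]    = id;
    batch.pos     [i]    = pos;
    batch.n_seq_id[i]    = 1;
    batch.seq_id  [i][0] = seq_id;   // must be >= 0 and < n_seq_max
    batch.logits  [i]    = want_logits ? 1 : 0;
    batch.n_tokens++;
}

int main() {
    const int32_t n_seq_max = 2;

    llama_batch batch = llama_batch_init(/*n_tokens*/ 8, /*embd*/ 0, n_seq_max);

    // two independent sequences in one batch
    add_token(batch, /*id*/ 1, /*pos*/ 0, /*seq_id*/ 0, false);
    add_token(batch, /*id*/ 2, /*pos*/ 1, /*seq_id*/ 0, true);
    add_token(batch, /*id*/ 1, /*pos*/ 0, /*seq_id*/ 1, false);
    add_token(batch, /*id*/ 7, /*pos*/ 1, /*seq_id*/ 1, true);

    // ... llama_decode(ctx, batch) would be called here ...

    llama_batch_free(batch);
    return 0;
}
```
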
__func__, batch.n_tokens); + + return 1; + } + case LLAMA_MEMORY_STATUS_FAILED_COMPUTE: + { + return -2; + } + } + + break; + } // reserve output buffer if (output_reserve(n_outputs_all) < n_outputs_all) { @@ -928,13 +989,10 @@ int llama_context::decode(llama_batch & inp_batch) { return -2; }; - // handle any pending defrags/shifts - kv_self_update(); - int64_t n_outputs_prev = 0; - while (sbatch.n_tokens > 0) { - llama_ubatch ubatch = kv_self->ubatch_next(sbatch, cparams.n_ubatch, embd_pooled); + do { + const auto & ubatch = kv_state->get_ubatch(); // count the outputs in this u_batch { @@ -953,33 +1011,37 @@ int llama_context::decode(llama_batch & inp_batch) { n_outputs = n_outputs_new; } - // find KV slot - if (!kv_self->find_slot(ubatch)) { - return 1; - } - ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - auto * gf = graph_init(); - auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DECODER); + ggml_status status; + const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, kv_state.get(), status); - // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + if (!res) { + // the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache + llama_pos pos_min[LLAMA_MAX_PARALLEL_SEQUENCES] = { std::numeric_limits::max() }; - ggml_backend_sched_alloc_graph(sched.get(), gf); + for (uint32_t i = 0; i < ubatch.n_tokens; ++i) { + const auto & seq_id = ubatch.seq_id[i][0]; - res->set_inputs(&ubatch); + pos_min[seq_id] = std::min(pos_min[seq_id], ubatch.pos[i]); + } - const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); - if (compute_status != GGML_STATUS_SUCCESS) { - switch (compute_status) { - case GGML_STATUS_ABORTED: - return 2; - case GGML_STATUS_ALLOC_FAILED: - return -2; - case GGML_STATUS_FAILED: - default: - return -3; + for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) { + if (pos_min[s] == std::numeric_limits::max()) { + continue; + } + + LLAMA_LOG_WARN("%s: removing KV cache entries for seq_id = %d, pos = [%d, +inf)\n", __func__, s, pos_min[s]); + + llama_kv_self_seq_rm(this, s, pos_min[s], -1); + } + + switch (status) { + case GGML_STATUS_ABORTED: return 2; + case GGML_STATUS_ALLOC_FAILED: return -2; + case GGML_STATUS_FAILED: return -3; + case GGML_STATUS_SUCCESS: GGML_ABORT("should not happen"); } } @@ -1066,10 +1128,7 @@ int llama_context::decode(llama_batch & inp_batch) { } n_outputs_prev += n_outputs; - } - - // finalize the batch processing - kv_guard.commit(); + } while (kv_state->next()); // set to total number of outputs in the batch, for use in llama_get_logits_ith n_outputs = n_outputs_all; @@ -1078,7 +1137,7 @@ int llama_context::decode(llama_batch & inp_batch) { { bool sorted_output = true; - auto & out_ids = sbatch.out_ids; + auto & out_ids = kv_state->out_ids(); GGML_ASSERT(out_ids.size() == (size_t) n_outputs_all); @@ -1238,11 +1297,52 @@ ggml_cgraph * llama_context::graph_init() { return ggml_new_graph_custom(ctx_compute.get(), graph_max_nodes(), false); } +ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_state_i * mstate) { + LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs); + + if (n_tokens % n_seqs != 0) { + n_tokens = (n_tokens / n_seqs) * n_seqs; + n_outputs = 
std::min(n_outputs, n_tokens); + + LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs); + } + + // store the n_outputs as it is, and restore it afterwards + // TODO: not sure if needed, might simplify in the future by removing this + const auto save_n_outputs = this->n_outputs; + + this->n_outputs = n_outputs; + + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + + auto * gf = graph_init(); + auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mstate); + + this->n_outputs = save_n_outputs; + + if (!res) { + LLAMA_LOG_ERROR("%s: failed to build worst-case graph\n", __func__); + return nullptr; + } + + ggml_backend_sched_reset(sched.get()); + + // initialize scheduler with the specified graph + if (!ggml_backend_sched_reserve(sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + return nullptr; + } + + return gf; +} + llm_graph_result_ptr llama_context::graph_build( - ggml_context * ctx, - ggml_cgraph * gf, - const llama_ubatch & ubatch, - llm_graph_type gtype) { + ggml_context * ctx, + ggml_cgraph * gf, + const llama_ubatch & ubatch, + llm_graph_type gtype, + const llama_memory_state_i * mstate) { return model.build_graph( { /*.ctx =*/ ctx, @@ -1254,7 +1354,7 @@ llm_graph_result_ptr llama_context::graph_build( /*.backend_cpu =*/ backend_cpu, /*.cvec =*/ &cvec, /*.loras =*/ &loras, - /*.memory =*/ memory.get(), + /*.mstate =*/ mstate, /*.cross =*/ &cross, /*.n_outputs =*/ n_outputs, /*.cb =*/ graph_get_cb(), @@ -1935,7 +2035,6 @@ void llama_context::opt_epoch_iter( llama_kv_cache * kv_self = static_cast(memory.get()); kv_self->clear(); - llama_kv_cache_guard kv_guard(kv_self); for (uint32_t pos_ctx = 0; pos_ctx < n_ctx; pos_ctx += n_batch) { batch.n_tokens = n_batch; @@ -1958,7 +2057,11 @@ void llama_context::opt_epoch_iter( int64_t n_outputs_all = n_tokens_all; - llama_sbatch sbatch = kv_self->sbatch_init(batch, /*logits_all =*/ true); + auto kv_state = kv_self->init_batch(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ true); + if (!kv_state || kv_state->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) { + LLAMA_LOG_ERROR("%s: could not initialize batch\n", __func__); + break; + } // reserve output buffer if (output_reserve(n_outputs_all) < n_outputs_all) { @@ -1966,20 +2069,19 @@ void llama_context::opt_epoch_iter( GGML_ABORT("TODO: handle this error"); }; - for (uint32_t pos_batch = 0; pos_batch < n_batch; pos_batch += n_ubatch) { - llama_ubatch ubatch = kv_self->ubatch_next(sbatch, cparams.n_ubatch, embd_pooled); + uint32_t pos_batch = 0; + do { + const auto & ubatch = kv_state->get_ubatch(); n_outputs = ubatch.n_tokens; - // TODO: not sure if this is needed - if (!kv_self->find_slot(ubatch)) { - LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens); - - GGML_ABORT("TODO: handle this error"); + if (!kv_state->apply()) { + LLAMA_LOG_ERROR("%s: failed to update the memory state\n", __func__); + break; } auto * gf = graph_init(); - auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT); + auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, kv_state.get()); struct ggml_context * ctx_compute_opt; { @@ -1994,6 
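
The new `graph_reserve()` builds its dummy ubatch from `n_seqs = cparams.n_seq_max`, so it first rounds `n_tokens` down to a multiple of `n_seqs` and clamps `n_outputs` accordingly. A tiny worked example of that arithmetic, with illustrative numbers:

```cpp
// Sketch of the rounding in llama_context::graph_reserve: the dummy ubatch needs
// n_tokens divisible by n_seqs, and n_outputs can never exceed n_tokens.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    uint32_t n_tokens  = 512;
    uint32_t n_seqs    = 3;   // e.g. cparams.n_seq_max
    uint32_t n_outputs = 512;

    if (n_tokens % n_seqs != 0) {
        n_tokens  = (n_tokens / n_seqs) * n_seqs;  // 512 -> 510
        n_outputs = std::min(n_outputs, n_tokens); // 512 -> 510
    }

    std::printf("n_tokens = %u, n_seqs = %u, n_outputs = %u\n", n_tokens, n_seqs, n_outputs);
}
```
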
+2096,7 @@ void llama_context::opt_epoch_iter( } ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_tokens(), res->get_logits()); ggml_opt_alloc(opt_ctx, train); + res->set_inputs(&ubatch); { struct ggml_tensor * labels = ggml_opt_labels(opt_ctx); @@ -2011,10 +2114,10 @@ void llama_context::opt_epoch_iter( callback(train, opt_ctx, dataset, result, idata_in_loop + (pos_ctx + pos_batch)/n_ubatch + 1, ndata_in_loop, t_loop_start); } ggml_free(ctx_compute_opt); - } - } - kv_guard.commit(); + pos_batch += ubatch.n_tokens; + } while (kv_state->next()); + } } void llama_context::opt_epoch( @@ -2178,6 +2281,7 @@ llama_kv_cache * llama_get_kv_self(llama_context * ctx) { return ctx->get_kv_self(); } +// deprecated void llama_kv_self_update(llama_context * ctx) { ctx->kv_self_update(); } @@ -2432,6 +2536,7 @@ llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { return kv->seq_pos_max(seq_id); } +// deprecated void llama_kv_self_defrag(llama_context * ctx) { auto * kv = ctx->get_kv_self(); if (!kv) { @@ -2573,22 +2678,8 @@ int32_t llama_encode( int32_t llama_decode( llama_context * ctx, llama_batch batch) { - int ret = ctx->decode(batch); - - // defrag and try again - // TODO: distinguish return code when we are sure that even after defrag there is no space available - if (ret == 1) { - llama_kv_self_defrag(ctx); - ret = ctx->decode(batch); - - if (ret == 1) { - LLAMA_LOG_WARN("%s: failed to find KV cache slot for batch of size %d\n", __func__, batch.n_tokens); - - return ret; - } - } - - if (ret != 0) { + const int ret = ctx->decode(batch); + if (ret != 0 && ret != 1) { LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); } diff --git a/src/llama-context.h b/src/llama-context.h index c0ceacb10..3b880286b 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -18,6 +18,9 @@ struct llama_kv_cache; class llama_io_read_i; class llama_io_write_i; +class llama_memory_i; +class llama_memory_state_i; + struct llama_context { // init scheduler and compute buffers, reserve worst-case graphs llama_context( @@ -47,7 +50,9 @@ struct llama_context { llama_kv_cache * get_kv_self(); const llama_kv_cache * get_kv_self() const; - void kv_self_update(); + // return true of the KV cache was updated + // TODO: remove + bool kv_self_update(); enum llama_pooling_type pooling_type() const; @@ -88,6 +93,16 @@ struct llama_context { int32_t il_start, int32_t il_end); + // process a single ubatch with a specific graph type + // if memory_state is provided, it will be applied first to the context's memory + // ret contains the status of the graph computation + // returns nullptr only if ret != GGML_STATUS_SUCCESS + llm_graph_result_ptr process_ubatch( + const llama_ubatch & ubatch, + llm_graph_type gtype, + llama_memory_state_i * mstate, + ggml_status & ret); + int encode(llama_batch & inp_batch); int decode(llama_batch & inp_batch); @@ -180,16 +195,18 @@ public: ggml_cgraph * graph_init(); // returns the result of ggml_backend_sched_graph_compute_async execution - ggml_status graph_compute( - ggml_cgraph * gf, - bool batched); + ggml_status graph_compute(ggml_cgraph * gf, bool batched); + + // reserve a graph with a dummy ubatch of the specified size + ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_state_i * mstate); private: llm_graph_result_ptr graph_build( - ggml_context * ctx, - ggml_cgraph * gf, - const llama_ubatch & ubatch, - llm_graph_type gtype); + ggml_context * ctx, + ggml_cgraph * gf, + const 
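
The `llama_decode` wrapper above no longer defrags and retries on its own; the retry now lives inside `llama_context::decode`, and a return value of `1` still means "no KV cache slot for this batch". A hedged caller-side sketch of handling that return code follows; the "drop the oldest half of sequence 0" recovery is purely illustrative, and context/token setup is omitted.

```cpp
// Sketch: caller-side handling of llama_decode return codes after this change.
// 0 = ok, 1 = no KV slot for the batch, negative = error.
#include "llama.h"

#include <cstdio>
#include <vector>

static bool decode_with_fallback(llama_context * ctx, std::vector<llama_token> & tokens) {
    llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size());

    const int ret = llama_decode(ctx, batch);
    if (ret == 0) {
        return true;
    }
    if (ret == 1) {
        // no KV cache slot: free space (illustrative: drop the oldest half of
        // sequence 0) or split the batch into smaller chunks and try again
        const llama_pos p_max = llama_kv_self_seq_pos_max(ctx, 0);
        llama_kv_self_seq_rm(ctx, 0, 0, p_max / 2);
        return llama_decode(ctx, batch) == 0;
    }
    std::fprintf(stderr, "llama_decode failed with %d\n", ret);
    return false;
}
```
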
llama_ubatch & ubatch, + llm_graph_type gtype, + const llama_memory_state_i * mstate); llm_graph_cb graph_get_cb() const; diff --git a/src/llama-cparams.cpp b/src/llama-cparams.cpp index 28369be36..f7b36590f 100644 --- a/src/llama-cparams.cpp +++ b/src/llama-cparams.cpp @@ -1 +1,5 @@ #include "llama-cparams.h" + +size_t llama_max_parallel_sequences(void) { + return LLAMA_MAX_PARALLEL_SEQUENCES; +} diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 246fa5777..2871031ef 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -4,6 +4,8 @@ #include +#define LLAMA_MAX_PARALLEL_SEQUENCES 64 + struct llama_cparams { uint32_t n_ctx; // context size used during inference uint32_t n_batch; diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp index 973b47ae0..bed706bb2 100644 --- a/src/llama-grammar.cpp +++ b/src/llama-grammar.cpp @@ -1177,8 +1177,18 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token for (const auto & trigger_pattern : grammar.trigger_patterns) { if (std::regex_match(grammar.trigger_buffer, match, trigger_pattern.regex)) { grammar.awaiting_trigger = false; - // get from the first match to the end of the string - auto constrained_str = grammar.trigger_buffer.substr(match.position(1)); + // get from the first matched capturing group to the end of the string + size_t start = std::string::npos; + for (auto i = 1u; i < match.size(); i++) { + if (match.length(i) > 0) { + start = match.position(i); + break; + } + } + if (start == std::string::npos) { + start = match.position(0); + } + auto constrained_str = grammar.trigger_buffer.substr(start); // std::string constrained_str(match[1].first, grammar.trigger_buffer.end()); grammar.trigger_buffer.clear(); llama_grammar_accept_str(grammar, constrained_str); diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 13e36d161..727e119e3 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -3,7 +3,10 @@ #include "llama-impl.h" #include "llama-batch.h" #include "llama-cparams.h" -#include "llama-kv-cache.h" + +#include "llama-kv-cache-unified.h" +#include "llama-kv-cache-unified-iswa.h" +#include "llama-kv-cache-recurrent.h" #include #include @@ -83,7 +86,7 @@ void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) { void llm_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) { if (pos_bucket) { - kv_self->set_input_pos_bucket(pos_bucket, ubatch); + kv_state->set_input_pos_bucket(pos_bucket, ubatch); } } @@ -234,7 +237,7 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) { void llm_graph_input_s_copy::set_input(const llama_ubatch * ubatch) { GGML_UNUSED(ubatch); - const int64_t n_kv = kv_self->n; + const int64_t n_kv = kv_state->get_n_kv(); if (s_copy) { GGML_ASSERT(ggml_backend_buffer_is_host(s_copy->buffer)); @@ -242,7 +245,7 @@ void llm_graph_input_s_copy::set_input(const llama_ubatch * ubatch) { // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n for (uint32_t i = 0; i < n_kv; ++i) { - data[i] = kv_self->s_copy(i); + data[i] = kv_state->s_copy(i); } } } @@ -250,7 +253,7 @@ void llm_graph_input_s_copy::set_input(const llama_ubatch * ubatch) { void llm_graph_input_s_mask::set_input(const llama_ubatch * ubatch) { GGML_UNUSED(ubatch); - const int64_t n_kv = kv_self->n; + const int64_t n_kv = kv_state->get_n_kv(); if (s_mask) { GGML_ASSERT(ggml_backend_buffer_is_host(s_mask->buffer)); @@ -258,7 +261,7 @@ void llm_graph_input_s_mask::set_input(const llama_ubatch * ubatch) { // clear unused states for 
(int i = 0; i < n_kv; ++i) { - data[i] = kv_self->s_mask(i); + data[i] = kv_state->s_mask(i); } } } @@ -362,17 +365,17 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) { void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) { if (self_kq_mask) { - kv_self->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); + kv_state->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); } } void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) { if (self_kq_mask) { - kv_self->get_kv_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); + kv_state->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); } if (self_kq_mask_swa) { - kv_self->get_kv_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); + kv_state->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); } } @@ -448,14 +451,14 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : backend_cpu (params.backend_cpu), cvec (params.cvec), loras (params.loras), - memory (params.memory), + mstate (params.mstate), cross (params.cross), cb_func (params.cb), res (std::make_unique()) { } int64_t llm_graph_context::n_pos_per_embd() const { - return arch == LLM_ARCH_QWEN2VL ? 4 : 1; + return hparams.rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1; } void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const { @@ -954,11 +957,11 @@ ggml_tensor * llm_graph_context::build_inp_cls() const { } ggml_tensor * llm_graph_context::build_inp_s_copy() const { - const llama_kv_cache_recurrent * kv_self = static_cast(memory); + const auto * kv_state = static_cast(mstate); - auto inp = std::make_unique(kv_self); + auto inp = std::make_unique(kv_state); - const auto n_kv = kv_self->n; + const auto n_kv = kv_state->get_n_kv(); auto & cur = inp->s_copy; @@ -971,11 +974,11 @@ ggml_tensor * llm_graph_context::build_inp_s_copy() const { } ggml_tensor * llm_graph_context::build_inp_s_mask() const { - const llama_kv_cache_recurrent * kv_self = static_cast(memory); + const auto * kv_state = static_cast(mstate); - auto inp = std::make_unique(kv_self); + auto inp = std::make_unique(kv_state); - const auto n_kv = kv_self->n; + const auto n_kv = kv_state->get_n_kv(); auto & cur = inp->s_mask; @@ -1025,11 +1028,11 @@ ggml_tensor * llm_graph_context::build_inp_pos_bucket_enc() const { } ggml_tensor * llm_graph_context::build_inp_pos_bucket_dec() const { - const llama_kv_cache_unified * kv_self = static_cast(memory); + const auto * kv_state = static_cast(mstate); - auto inp = std::make_unique(hparams, kv_self); + auto inp = std::make_unique(hparams, kv_state); - const auto n_kv = kv_self->get_n(); + const auto n_kv = kv_state->get_n_kv(); auto & cur = inp->pos_bucket; @@ -1231,14 +1234,14 @@ ggml_tensor * llm_graph_context::build_attn( } llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified() const { - const llama_kv_cache_unified * kv_self = static_cast(memory); + const auto * kv_state = static_cast(mstate); - auto inp = std::make_unique(hparams, cparams, kv_self); + auto inp = std::make_unique(hparams, cparams, kv_state); { GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified_iswa for SWA"); - const auto n_kv = kv_self->get_n(); + const auto n_kv = kv_state->get_n_kv(); inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); //cb(inp->self_kq_mask, "KQ_mask", -1); @@ -1268,25 
+1271,29 @@ ggml_tensor * llm_graph_context::build_attn( ggml_build_forward_expand(gf, k_cur); ggml_build_forward_expand(gf, v_cur); - const llama_kv_cache_unified * kv_self = static_cast(memory); + const auto * kv_state = static_cast(mstate); // store to KV cache { - ggml_build_forward_expand(gf, kv_self->cpy_k(ctx0, k_cur, il)); - ggml_build_forward_expand(gf, kv_self->cpy_v(ctx0, v_cur, il)); + ggml_build_forward_expand(gf, kv_state->cpy_k(ctx0, k_cur, il)); + ggml_build_forward_expand(gf, kv_state->cpy_v(ctx0, v_cur, il)); } const auto & kq_mask = inp->get_kq_mask(); ggml_tensor * q = q_cur; - ggml_tensor * k = kv_self->get_k(ctx0, il); - ggml_tensor * v = kv_self->get_v(ctx0, il); + ggml_tensor * k = kv_state->get_k(ctx0, il); + ggml_tensor * v = kv_state->get_v(ctx0, il); ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale); cb(cur, "kqv_out", il); if (wo) { cur = build_lora_mm(wo, cur); + if (arch == LLM_ARCH_GLM4) { + // GLM4 seems to have numerical issues with half-precision accumulators + ggml_mul_mat_set_prec(cur, GGML_PREC_F32); + } } if (wo_b) { @@ -1297,12 +1304,12 @@ ggml_tensor * llm_graph_context::build_attn( } llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unified_iswa() const { - const llama_kv_cache_unified_iswa * kv_self = static_cast(memory); + const auto * kv_state = static_cast(mstate); - auto inp = std::make_unique(hparams, cparams, kv_self); + auto inp = std::make_unique(hparams, cparams, kv_state); { - const auto n_kv = kv_self->get_kv_base()->get_n(); + const auto n_kv = kv_state->get_base()->get_n_kv(); inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); //cb(inp->self_kq_mask, "KQ_mask", -1); @@ -1314,7 +1321,7 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif { GGML_ASSERT(hparams.swa_type != LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified for non-SWA"); - const auto n_kv = kv_self->get_kv_swa()->get_n(); + const auto n_kv = kv_state->get_swa()->get_n_kv(); inp->self_kq_mask_swa = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); //cb(inp->self_kq_mask_swa, "KQ_mask_swa", -1); @@ -1344,33 +1351,29 @@ ggml_tensor * llm_graph_context::build_attn( ggml_build_forward_expand(gf, k_cur); ggml_build_forward_expand(gf, v_cur); + const auto * kv_state_iswa = static_cast(mstate); + const bool is_swa = hparams.is_swa(il); - const llama_kv_cache_unified_iswa * kv_self = static_cast(memory); - - const auto * kv = is_swa ? kv_self->get_kv_swa() : kv_self->get_kv_base(); + const auto * kv_state = is_swa ? kv_state_iswa->get_swa() : kv_state_iswa->get_base(); // store to KV cache { - ggml_build_forward_expand(gf, kv->cpy_k(ctx0, k_cur, il)); - ggml_build_forward_expand(gf, kv->cpy_v(ctx0, v_cur, il)); + ggml_build_forward_expand(gf, kv_state->cpy_k(ctx0, k_cur, il)); + ggml_build_forward_expand(gf, kv_state->cpy_v(ctx0, v_cur, il)); } const auto & kq_mask = is_swa ? 
inp->get_kq_mask_swa() : inp->get_kq_mask(); ggml_tensor * q = q_cur; - ggml_tensor * k = kv->get_k(ctx0, il); - ggml_tensor * v = kv->get_v(ctx0, il); + ggml_tensor * k = kv_state->get_k(ctx0, il); + ggml_tensor * v = kv_state->get_v(ctx0, il); ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale); cb(cur, "kqv_out", il); if (wo) { cur = build_lora_mm(wo, cur); - if (arch == LLM_ARCH_GLM4) { - // GLM4 seems to have numerical issues with half-precision accumulators - ggml_mul_mat_set_prec(cur, GGML_PREC_F32); - } } if (wo_b) { @@ -1446,12 +1449,12 @@ ggml_tensor * llm_graph_context::build_copy_mask_state( ggml_tensor * state_mask, int32_t n_state, int32_t n_seqs) const { - const llama_kv_cache_recurrent * kv_self = static_cast(memory); + const auto * kv_state = static_cast(mstate); - const auto n_kv = kv_self->n; - const auto kv_head = kv_self->head; + const auto n_kv = kv_state->get_n_kv(); + const auto kv_head = kv_state->get_head(); - ggml_tensor * states = ggml_reshape_2d(ctx0, s, n_state, kv_self->size); + ggml_tensor * states = ggml_reshape_2d(ctx0, s, n_state, kv_state->get_size()); // copy states // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv @@ -1478,13 +1481,13 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load( ggml_tensor * state_mask, const llama_ubatch & ubatch, int il) const { - const llama_kv_cache_recurrent * kv_self = static_cast(memory); + const auto * kv_state = static_cast(mstate); const auto token_shift_count = hparams.token_shift_count; const int64_t n_seqs = ubatch.n_seqs; - ggml_tensor * token_shift_all = kv_self->k_l[il]; + ggml_tensor * token_shift_all = kv_state->get_k_l(il); ggml_tensor * token_shift = build_copy_mask_state( gf, token_shift_all, state_copy, state_mask, @@ -1499,19 +1502,19 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store( ggml_tensor * token_shift, const llama_ubatch & ubatch, int il) const { - const llama_kv_cache_recurrent * kv_self = static_cast(memory); + const auto * kv_state = static_cast(mstate); const auto token_shift_count = hparams.token_shift_count; const auto n_embd = hparams.n_embd; const int64_t n_seqs = ubatch.n_seqs; - const auto kv_head = kv_self->head; + const auto kv_head = kv_state->get_head(); return ggml_cpy( ctx0, ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * token_shift_count, 0), - ggml_view_1d(ctx0, kv_self->k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self->k_l[il])) + ggml_view_1d(ctx0, kv_state->get_k_l(il), hparams.n_embd_k_s()*n_seqs, hparams.n_embd_k_s()*kv_head*ggml_element_size(kv_state->get_k_l(il))) ); } @@ -1562,20 +1565,25 @@ void llm_graph_context::build_pooling( ggml_tensor * inp_cls = build_inp_cls(); inp = ggml_get_rows(ctx0, inp, inp_cls); - // classification head - // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566 - GGML_ASSERT(cls != nullptr); - GGML_ASSERT(cls_b != nullptr); + if (cls != nullptr && cls_b != nullptr) { + // classification head + // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566 + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b); + cur = ggml_tanh(ctx0, cur); - cur = ggml_add (ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b); - cur = ggml_tanh(ctx0, cur); - - // some models don't have `cls_out`, for example: 
https://huggingface.co/jinaai/jina-reranker-v1-tiny-en - // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896 - if (cls_out) { + // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en + // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896 + if (cls_out) { + GGML_ASSERT(cls_out_b != nullptr); + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b); + } + } else if (cls_out) { + // Single layer classification head (direct projection) + // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476 GGML_ASSERT(cls_out_b != nullptr); - - cur = ggml_add (ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b); + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, inp), cls_out_b); + } else { + GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b"); } } break; default: diff --git a/src/llama-graph.h b/src/llama-graph.h index 2b85bb25b..d1c5dd1bf 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -17,10 +17,11 @@ struct ggml_tensor; struct llama_ubatch; struct llama_cparams; -class llama_memory_i; -class llama_kv_cache_unified; -class llama_kv_cache_unified_iswa; -class llama_kv_cache_recurrent; +class llama_memory_state_i; + +class llama_kv_cache_unified_state; +class llama_kv_cache_unified_iswa_state; +class llama_kv_cache_recurrent_state; // certain models (typically multi-modal) can produce different types of graphs enum llm_graph_type { @@ -133,7 +134,7 @@ class llm_graph_input_pos_bucket_kv : public llm_graph_input_i { public: llm_graph_input_pos_bucket_kv( const llama_hparams & hparams, - const llama_kv_cache_unified * kv_self) : hparams(hparams), kv_self(kv_self) {} + const llama_kv_cache_unified_state * kv_state) : hparams(hparams), kv_state(kv_state) {} virtual ~llm_graph_input_pos_bucket_kv() = default; void set_input(const llama_ubatch * ubatch) override; @@ -141,7 +142,7 @@ public: ggml_tensor * pos_bucket = nullptr; // I32 [n_kv, n_batch] const llama_hparams & hparams; - const llama_kv_cache_unified * kv_self; + const llama_kv_cache_unified_state * kv_state; }; class llm_graph_input_out_ids : public llm_graph_input_i { @@ -188,26 +189,26 @@ public: class llm_graph_input_s_copy : public llm_graph_input_i { public: - llm_graph_input_s_copy(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {} + llm_graph_input_s_copy(const llama_kv_cache_recurrent_state * kv_state) : kv_state(kv_state) {} virtual ~llm_graph_input_s_copy() = default; void set_input(const llama_ubatch * ubatch) override; ggml_tensor * s_copy; // I32 [kv_size] - const llama_kv_cache_recurrent * kv_self; + const llama_kv_cache_recurrent_state * kv_state; }; class llm_graph_input_s_mask : public llm_graph_input_i { public: - llm_graph_input_s_mask(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {} + llm_graph_input_s_mask(const llama_kv_cache_recurrent_state * kv_state) : kv_state(kv_state) {} virtual ~llm_graph_input_s_mask() = default; void set_input(const llama_ubatch * ubatch) override; ggml_tensor * s_mask; // F32 [1, n_kv] - const llama_kv_cache_recurrent * kv_self; + const llama_kv_cache_recurrent_state * kv_state; }; class llm_graph_input_cross_embd : public llm_graph_input_i { @@ -247,10 +248,10 @@ public: llm_graph_input_attn_kv_unified( const llama_hparams & hparams, const 
llama_cparams & cparams, - const llama_kv_cache_unified * kv_self) : + const llama_kv_cache_unified_state * kv_state) : hparams(hparams), cparams(cparams), - kv_self(kv_self) { + kv_state(kv_state) { } ~llm_graph_input_attn_kv_unified() = default; @@ -264,7 +265,7 @@ public: const llama_hparams & hparams; const llama_cparams & cparams; - const llama_kv_cache_unified * kv_self; + const llama_kv_cache_unified_state * kv_state; }; class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i { @@ -272,10 +273,10 @@ public: llm_graph_input_attn_kv_unified_iswa( const llama_hparams & hparams, const llama_cparams & cparams, - const llama_kv_cache_unified_iswa * kv_self) : + const llama_kv_cache_unified_iswa_state * kv_state) : hparams(hparams), cparams(cparams), - kv_self(kv_self) { + kv_state(kv_state) { } ~llm_graph_input_attn_kv_unified_iswa() = default; @@ -292,7 +293,7 @@ public: const llama_hparams & hparams; const llama_cparams & cparams; - const llama_kv_cache_unified_iswa * kv_self; + const llama_kv_cache_unified_iswa_state * kv_state; }; class llm_graph_input_attn_cross : public llm_graph_input_i { @@ -383,10 +384,10 @@ struct llm_graph_params { ggml_backend_sched_t sched; ggml_backend_t backend_cpu; - const llama_adapter_cvec * cvec; - const llama_adapter_loras * loras; - const llama_memory_i * memory; - const llama_cross * cross; + const llama_adapter_cvec * cvec; + const llama_adapter_loras * loras; + const llama_memory_state_i * mstate; + const llama_cross * cross; int32_t n_outputs; @@ -435,10 +436,10 @@ struct llm_graph_context { ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove? - const llama_adapter_cvec * cvec; - const llama_adapter_loras * loras; - const llama_memory_i * memory; - const llama_cross * cross; + const llama_adapter_cvec * cvec; + const llama_adapter_loras * loras; + const llama_memory_state_i * mstate; + const llama_cross * cross; const llm_graph_cb & cb_func; diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 4f84e56b3..1499eb08a 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -2,6 +2,22 @@ #include "ggml.h" +void llama_hparams::set_swa_pattern(uint32_t n_pattern) { + for (uint32_t il = 0; il < n_layer; ++il) { + swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1)); + } +} + +bool llama_hparams::is_swa_any() const { + for (uint32_t il = 0; il < n_layer; ++il) { + if (swa_layers[il]) { + return true; + } + } + + return false; +} + uint32_t llama_hparams::n_head(uint32_t il) const { if (il < n_layer) { return n_head_arr[il]; @@ -72,7 +88,7 @@ uint32_t llama_hparams::n_embd_v_s() const { bool llama_hparams::is_swa(uint32_t il) const { if (il < n_layer) { - return n_swa_pattern == 0 || (il % n_swa_pattern < (n_swa_pattern - 1)); + return swa_layers[il]; } GGML_ABORT("fatal error"); diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 5222eedcf..b2bcb8b01 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -102,20 +102,12 @@ struct llama_hparams { // Sliding Window Attention (SWA) llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE; - - uint32_t n_swa = 0; // the size of the sliding window (0 - no SWA) - uint32_t n_swa_pattern = 1; // this value n means that every nth layer is dense (i.e. 
non-SWA) - // by default n == 1, all layers are dense - // note that if n_swa_pattern == 0, all layers are SWA - // example: n_swa_pattern = 3 - // il == 0: swa - // il == 1: swa - // il == 2: dense - // il == 3: swa - // il == 4: swa - // il == 5: dense - // il == 6: swa - // etc ... + // the size of the sliding window (0 - no SWA) + uint32_t n_swa = 0; + // if swa_layers[il] == true, then layer il is SWA + // if swa_layers[il] == false, then layer il is dense (i.e. non-SWA) + // by default, all layers are dense + std::array swa_layers; // for State Space Models uint32_t ssm_d_conv = 0; @@ -139,6 +131,9 @@ struct llama_hparams { bool attn_soft_cap = false; bool use_kq_norm = true; + // for Classifiers + uint32_t n_cls_out = 1; + // llama4 uint32_t n_moe_layer_step = 0; uint32_t n_no_rope_layer_step = 4; @@ -153,6 +148,23 @@ struct llama_hparams { enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE; enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE; + // this value n_pattern means that every nth layer is dense (i.e. non-SWA) + // note that if n_pattern == 0, all layers are SWA + // if n_pattern == 1, all layers are dense + // example: n_pattern = 3 + // il == 0: swa + // il == 1: swa + // il == 2: dense + // il == 3: swa + // il == 4: swa + // il == 5: dense + // il == 6: swa + // etc ... + void set_swa_pattern(uint32_t n_pattern); + + // return true if one of the layers is SWA + bool is_swa_any() const; + uint32_t n_head(uint32_t il = 0) const; uint32_t n_head_kv(uint32_t il = 0) const; diff --git a/src/llama-kv-cache-recurrent.cpp b/src/llama-kv-cache-recurrent.cpp new file mode 100644 index 000000000..641eab2f3 --- /dev/null +++ b/src/llama-kv-cache-recurrent.cpp @@ -0,0 +1,1132 @@ +#include "llama-kv-cache-recurrent.h" + +#include "llama-impl.h" +#include "llama-batch.h" +#include "llama-model.h" + +#include +#include +#include +#include +#include + +// +// llama_kv_cache_recurrent +// + +llama_kv_cache_recurrent::llama_kv_cache_recurrent( + const llama_model & model, + ggml_type type_k, + ggml_type type_v, + bool offload, + uint32_t kv_size, + uint32_t n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) { + const int32_t n_layer = hparams.n_layer; + + LLAMA_LOG_INFO("%s: kv_size = %u, n_seq_max = %u, type_k = '%s', type_v = '%s', n_layer = %d\n", + __func__, kv_size, n_seq_max, ggml_type_name(type_k), ggml_type_name(type_v), n_layer); + + head = 0; + size = kv_size; + used = 0; + + cells.clear(); + cells.resize(kv_size); + + // create a context for each buffer type + std::map ctx_map; + auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { + auto it = ctx_map.find(buft); + if (it == ctx_map.end()) { + ggml_init_params params = { + /*.mem_size =*/ size_t(2u*n_layer*ggml_tensor_overhead()), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + ggml_context * ctx = ggml_init(params); + if (!ctx) { + return nullptr; + } + + ctx_map[buft] = ctx; + ctxs.emplace_back(ctx); + + return ctx; + } + + return it->second; + }; + + k_l.reserve(n_layer); + v_l.reserve(n_layer); + + for (int i = 0; i < n_layer; i++) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s(); + + const char * dev_name = "CPU"; + + ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type(); + + if (offload) { + auto * dev = model.dev_layer(i); + buft = ggml_backend_dev_buffer_type(dev); + + dev_name = ggml_backend_dev_name(dev); + } + + 
LLAMA_LOG_DEBUG("%s, layer %3d: dev = %s\n", __func__, i, dev_name); + + ggml_context * ctx = ctx_for_buft(buft); + if (!ctx) { + throw std::runtime_error("failed to create ggml context for kv cache"); + } + + ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); + ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); + ggml_format_name(k, "cache_k_l%d", i); + ggml_format_name(v, "cache_v_l%d", i); + k_l.push_back(k); + v_l.push_back(v); + } + + // allocate tensors and initialize the buffers to avoid NaNs in the padding + for (auto it : ctx_map) { + auto * buft = it.first; + auto * ctx = it.second; + + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (!buf) { + throw std::runtime_error("failed to allocate buffer for kv cache"); + } + ggml_backend_buffer_clear(buf, 0); + LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); + bufs.emplace_back(buf); + } + + { + const size_t memory_size_k = size_k_bytes(); + const size_t memory_size_v = size_v_bytes(); + + LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, + (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), + ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), + ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); + } +} + +void llama_kv_cache_recurrent::clear() { + for (int32_t i = 0; i < (int32_t) size; ++i) { + cells[i].pos = -1; + cells[i].seq_id.clear(); + cells[i].src = -1; + cells[i].tail = -1; + } + head = 0; + used = 0; + + for (auto & buf : bufs) { + ggml_backend_buffer_clear(buf.get(), 0); + } +} + +bool llama_kv_cache_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + uint32_t new_head = size; + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // models like Mamba or RWKV can't have a state partially erased + if (seq_id >= (int64_t) size) { + // could be fatal + return false; + } + if (0 <= seq_id) { + int32_t & tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + const kv_cell & cell = cells[tail_id]; + // partial intersection is invalid + if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) { + return false; + } + // invalidate tails which will be cleared + if (p0 <= cell.pos && cell.pos < p1) { + tail_id = -1; + } + } + } else { + // seq_id is negative, then the range should include everything or nothing + if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits::max())) { + return false; + } + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].pos >= p0 && cells[i].pos < p1) { + if (seq_id < 0) { + cells[i].seq_id.clear(); + } else if (cells[i].has_seq_id(seq_id)) { + cells[i].seq_id.erase(seq_id); + } else { + continue; + } + if (cells[i].is_empty()) { + // keep count of the number of used cells + if (cells[i].pos >= 0) { + used--; + } + cells[i].pos = -1; + cells[i].src = -1; + if (new_head == size) { + new_head = i; + } + } + } + } + + // If we freed up a slot, set head to it so searching can start there. 
+ if (new_head != size && new_head < head) { + head = new_head; + } + + return true; +} + +void llama_kv_cache_recurrent::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + if (seq_id_src == seq_id_dst) { + return; + } + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits<llama_pos>::max(); + } + + if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) { + kv_cell & tail_src = cells[seq_id_src]; + kv_cell & tail_dst = cells[seq_id_dst]; + if (tail_dst.tail >= 0) { + // clear destination seq_id if it wasn't empty + kv_cell & cell_dst = cells[tail_dst.tail]; + + cell_dst.seq_id.erase(seq_id_dst); + tail_dst.tail = -1; + if (cell_dst.seq_id.empty()) { + cell_dst.pos = -1; + cell_dst.src = -1; + used -= 1; + } + } + if (tail_src.tail >= 0) { + kv_cell & cell_src = cells[tail_src.tail]; + + cell_src.seq_id.insert(seq_id_dst); + tail_dst.tail = tail_src.tail; + } + } +} + +void llama_kv_cache_recurrent::seq_keep(llama_seq_id seq_id) { + uint32_t new_head = size; + + for (uint32_t i = 0; i < size; ++i) { + if ((llama_seq_id) i != seq_id) { + cells[i].tail = -1; + } + + if (!cells[i].has_seq_id(seq_id)) { + if (cells[i].pos >= 0) { + used--; + } + + cells[i].pos = -1; + cells[i].src = -1; + cells[i].seq_id.clear(); + + if (new_head == size){ + new_head = i; + } + } else { + cells[i].seq_id.clear(); + cells[i].seq_id.insert(seq_id); + } + } + + // If we freed up a slot, set head to it so searching can start there. + if (new_head != size && new_head < head) { + head = new_head; + } +} + +void llama_kv_cache_recurrent::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) { + if (shift == 0) { + return; + } + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits<llama_pos>::max(); + } + + // If there is no range then return early to avoid looping over the cache. + if (p0 == p1) { + return; + } + + // for Mamba-like or RWKV models, only the pos needs to be shifted + if (0 <= seq_id && seq_id < (int64_t) size) { + const int32_t tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + kv_cell & cell = cells[tail_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos += shift; + } + } + } +} + +void llama_kv_cache_recurrent::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { + if (d == 1) { + return; + } + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits<llama_pos>::max(); + } + + // If there is no range then return early to avoid looping over the cache.
+ if (p0 == p1) { + return; + } + + // for Mamba-like or RWKV models, only the pos needs to be changed + if (0 <= seq_id && seq_id < (int64_t) size) { + const int32_t tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + kv_cell & cell = cells[tail_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos /= d; + } + } + } +} + +llama_pos llama_kv_cache_recurrent::seq_pos_min(llama_seq_id seq_id) const { + llama_pos result = std::numeric_limits::max(); + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id)) { + result = std::min(result, cells[i].pos); + } + } + + if (result == std::numeric_limits::max()) { + result = -1; + } + + return result; +} + +llama_pos llama_kv_cache_recurrent::seq_pos_max(llama_seq_id seq_id) const { + llama_pos result = -1; + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id)) { + result = std::max(result, cells[i].pos); + } + } + + return result; +} + +llama_memory_state_ptr llama_kv_cache_recurrent::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_pooled, bool logits_all) { + GGML_UNUSED(embd_pooled); + + auto sbatch = llama_sbatch(batch, hparams.n_embd, false, logits_all); + + std::vector ubatches; + + while (sbatch.n_tokens > 0) { + llama_ubatch ubatch; + + if (embd_pooled) { + // Pooled embeddings cannot be split across ubatches (yet) + ubatch = sbatch.split_seq(n_ubatch); + } else { + ubatch = sbatch.split_equal(n_ubatch); + } + + ubatches.push_back(ubatch); + } + + if (!prepare(ubatches)) { + return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); + } + + return std::make_unique(LLAMA_MEMORY_STATUS_SUCCESS, this, std::move(sbatch), std::move(ubatches)); +} + +llama_memory_state_ptr llama_kv_cache_recurrent::init_full() { + return std::make_unique(LLAMA_MEMORY_STATUS_SUCCESS, this); +} + +bool llama_kv_cache_recurrent::prepare(const std::vector & ubatches) { + // simply remember the full state because it is very small for this type of cache + // TODO: optimize + auto org_cells = cells; + auto org_used = used; + auto org_head = head; + + bool success = true; + + // TODO: here we have to verify that all ubatches can fit in the cells + // however, the current implementation is broken because it relies on s_copy() and s_mask() to update the cells + // during the compute of each ubatch. 
to reproduce, uncomment the following loop and run: + // + // $ llama-parallel -m ./mamba-130m/ggml-model-f16.gguf -np 5 -ns 8 + // + // recovery from failures when the batch does not fit in the KV cache will not work correctly until this is fixed + // + GGML_UNUSED(ubatches); + //for (const auto & ubatch : ubatches) { + // if (!find_slot(ubatch)) { + // success = false; + // break; + // } + //} + + // restore the original state + cells = std::move(org_cells); + used = org_used; + head = org_head; + + return success; +} + +bool llama_kv_cache_recurrent::update(llama_context & lctx) { + GGML_UNUSED(lctx); + // noop + return false; +} + +void llama_kv_cache_recurrent::defrag_sched(float thold) { + GGML_UNUSED(thold); + // noop +} + +bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) { + const uint32_t n_tokens = ubatch.n_tokens; + const uint32_t n_seqs = ubatch.n_seqs; + + const uint32_t n_seq_tokens = ubatch.n_seq_tokens; + + // if we have enough unused cells before the current head -> + // better to start searching from the beginning of the cache, hoping to fill it + if (head > used + 2*n_tokens) { + head = 0; + } + + // For recurrent state architectures (like Mamba or RWKV), + // each cache cell can store the state for a whole sequence. + // A slot should always be contiguous. + + // can only process batches with an equal number of new tokens in each sequence + GGML_ASSERT(ubatch.equal_seqs); + + int32_t min = size - 1; + int32_t max = 0; + + // everything should fit if all seq_ids are smaller than the max + for (uint32_t s = 0; s < n_seqs; ++s) { + const uint32_t n_seq_id = ubatch.n_seq_id[s]; + for (uint32_t j = 0; j < n_seq_id; ++j) { + const llama_seq_id seq_id = ubatch.seq_id[s][j]; + + if (seq_id < 0 || (uint32_t) seq_id >= size) { + // too big seq_id + // TODO: would it be possible to resize the cache instead?
+ LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%u Try using a bigger --parallel value\n", __func__, seq_id, n_seq_max); + return false; + } + if (j > 0) { + kv_cell & seq = cells[seq_id]; + if (seq.tail >= 0) { + kv_cell & cell = cells[seq.tail]; + // clear cells from seq_ids that become shared + // (should not normally happen, but let's handle it anyway) + cell.seq_id.erase(seq_id); + seq.tail = -1; + if (cell.seq_id.empty()) { + cell.pos = -1; + cell.src = -1; + used -= 1; + } + } + } + } + } + +#ifndef NDEBUG + { + std::vector tails_verif; + tails_verif.assign(size, -1); + for (uint32_t i = 0; i < size; ++i) { + kv_cell & cell = cells[i]; + for (llama_seq_id seq_id : cell.seq_id) { + if (tails_verif[seq_id] != -1) { + LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]); + } + tails_verif[seq_id] = i; + } + } + for (uint32_t i = 0; i < size; ++i) { + if (tails_verif[i] != cells[i].tail) { + LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cells[i].tail, tails_verif[i]); + } + } + } +#endif + + // find next empty cell + uint32_t next_empty_cell = head; + + for (uint32_t i = 0; i < size; ++i) { + if (next_empty_cell >= size) { next_empty_cell -= size; } + kv_cell & cell = cells[next_empty_cell]; + if (cell.is_empty()) { break; } + next_empty_cell += 1; + } + + // find usable cell range + for (uint32_t s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + kv_cell & seq_meta = cells[seq_id]; + bool has_cell = false; + if (seq_meta.tail >= 0) { + kv_cell & cell = cells[seq_meta.tail]; + GGML_ASSERT(cell.has_seq_id(seq_id)); + // does this seq_id "own" the cell? + if (cell.seq_id.size() == 1) { has_cell = true; } + } + if (!has_cell) { + kv_cell & empty_cell = cells[next_empty_cell]; + GGML_ASSERT(empty_cell.is_empty()); + // copy old tail into the empty cell + if (seq_meta.tail >= 0) { + kv_cell & orig_cell = cells[seq_meta.tail]; + empty_cell.pos = orig_cell.pos; + empty_cell.src = orig_cell.src; + orig_cell.seq_id.erase(seq_id); + empty_cell.seq_id.insert(seq_id); // will be overwritten + } + seq_meta.tail = next_empty_cell; + // find next empty cell + if (s + 1 < n_seqs) { + next_empty_cell += 1; + for (uint32_t i = 0; i < size; ++i) { + if (next_empty_cell >= size) { next_empty_cell -= size; } + kv_cell & cell = cells[next_empty_cell]; + if (cell.is_empty()) { break; } + next_empty_cell += 1; + } + } + } + if (min > seq_meta.tail) { min = seq_meta.tail; } + if (max < seq_meta.tail) { max = seq_meta.tail; } + } + + // gather and re-order + for (uint32_t s = 0; s < n_seqs; ++s) { + int32_t dst_id = s + min; + int32_t src_id = cells[ubatch.seq_id[s][0]].tail; + if (dst_id != src_id) { + kv_cell & dst_cell = cells[dst_id]; + kv_cell & src_cell = cells[src_id]; + + std::swap(dst_cell.pos, src_cell.pos); + std::swap(dst_cell.src, src_cell.src); + std::swap(dst_cell.seq_id, src_cell.seq_id); + + // swap tails (assuming they NEVER overlap) + for (const llama_seq_id seq_id : src_cell.seq_id) { + cells[seq_id].tail = src_id; + } + for (const llama_seq_id seq_id : dst_cell.seq_id) { + cells[seq_id].tail = dst_id; + } + } + } + + // update the pos of the used seqs + for (uint32_t s = 0; s < n_seqs; ++s) { + const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1]; + int32_t cell_id = s + min; + kv_cell & cell = cells[cell_id]; + + if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) { + // What should happen when the pos backtracks or skips a 
value? + // Clearing the state mid-batch would require special-casing which isn't done. + LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n", + __func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens); + } + cell.pos = last_pos; + cell.seq_id.clear(); + for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) { + const llama_seq_id seq_id = ubatch.seq_id[s][j]; + cell.seq_id.insert(seq_id); + cells[seq_id].tail = cell_id; + } + } + + // allow getting the range of used cells, from head to head + n + head = min; + n = max - min + 1; + used = std::count_if(cells.begin(), cells.end(), + [](const kv_cell & cell){ return !cell.is_empty(); }); + + // sanity check + return n >= n_seqs; +} + +bool llama_kv_cache_recurrent::get_can_shift() const { + return false; +} + +int32_t llama_kv_cache_recurrent::s_copy(int i) const { + const uint32_t cell_id = i + head; + + ////////////////////////////////////////////// + // TODO: this should not mutate the KV cache ! + kv_cell & cell = const_cast(cells[cell_id]); + + // prevent out-of-bound sources + if (cell.src < 0 || (uint32_t) cell.src >= size) { + cell.src = cell_id; + } + + int32_t res = cell.src; + + // TODO: do not mutate the KV cache + // ensure copy only happens once + if (cell.src != (int32_t) cell_id) { + cell.src = cell_id; + } + + return res; +} + +float llama_kv_cache_recurrent::s_mask(int i) const { + const uint32_t cell_id = i + head; + + ////////////////////////////////////////////// + // TODO: this should not mutate the KV cache ! + kv_cell & cell = const_cast(cells[cell_id]); + + float res = (float) (cell.src >= 0); + + // only clear once + if (cell.src < 0) { + cell.src = cell_id; + } + + return res; +} + +size_t llama_kv_cache_recurrent::total_size() const { + size_t size = 0; + for (const auto & buf : bufs) { + size += ggml_backend_buffer_get_size(buf.get()); + } + + return size; +} + +size_t llama_kv_cache_recurrent::size_k_bytes() const { + size_t size_k_bytes = 0; + + for (const auto & k : k_l) { + size_k_bytes += ggml_nbytes(k); + } + + return size_k_bytes; +} + +size_t llama_kv_cache_recurrent::size_v_bytes() const { + size_t size_v_bytes = 0; + + for (const auto & v : v_l) { + size_v_bytes += ggml_nbytes(v); + } + + return size_v_bytes; +} + +void llama_kv_cache_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id) const { + std::vector> cell_ranges; // ranges, from inclusive, to exclusive + uint32_t cell_count = 0; + + // Count the number of cells with the specified seq_id + // Find all the ranges of cells with this seq id (or all, when -1) + uint32_t cell_range_begin = size; + for (uint32_t i = 0; i < size; ++i) { + const auto & cell = cells[i]; + if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) { + ++cell_count; + if (cell_range_begin == size) { + cell_range_begin = i; + } + } else { + if (cell_range_begin != size) { + cell_ranges.emplace_back(cell_range_begin, i); + cell_range_begin = size; + } + } + } + if (cell_range_begin != size) { + cell_ranges.emplace_back(cell_range_begin, size); + } + + // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count + uint32_t cell_count_check = 0; + for (const auto & range : cell_ranges) { + cell_count_check += range.second - range.first; + } + GGML_ASSERT(cell_count == cell_count_check); + + io.write(&cell_count, sizeof(cell_count)); + + state_write_meta(io, cell_ranges, seq_id); + state_write_data(io, cell_ranges); +} + +void llama_kv_cache_recurrent::state_read(llama_io_read_i & io, 
llama_seq_id seq_id) { + uint32_t cell_count; + io.read_to(&cell_count, sizeof(cell_count)); + + bool res = true; + + res = res && state_read_meta(io, cell_count, seq_id); + res = res && state_read_data(io, cell_count); + + if (!res) { + if (seq_id == -1) { + clear(); + } else { + seq_rm(seq_id, -1, -1); + } + throw std::runtime_error("failed to restore kv cache"); + } +} + +void llama_kv_cache_recurrent::state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id) const { + for (const auto & range : cell_ranges) { + for (uint32_t i = range.first; i < range.second; ++i) { + const auto & cell = cells[i]; + const llama_pos pos = cell.pos; + const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0; + + io.write(&pos, sizeof(pos)); + io.write(&n_seq_id, sizeof(n_seq_id)); + + if (n_seq_id) { + for (auto seq_id : cell.seq_id) { + io.write(&seq_id, sizeof(seq_id)); + } + } + } + } +} + +void llama_kv_cache_recurrent::state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges) const { + const uint32_t v_trans = 0; + const uint32_t n_layer = hparams.n_layer; + + io.write(&v_trans, sizeof(v_trans)); + io.write(&n_layer, sizeof(n_layer)); + + std::vector tmp_buf; + + // Iterate and write all the keys first, each row is a cell + // Get whole range at a time + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + + // Write key type + const int32_t k_type_i = (int32_t)k_l[il]->type; + io.write(&k_type_i, sizeof(k_type_i)); + + // Write row size of key + const uint64_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); + io.write(&k_size_row, sizeof(k_size_row)); + + // Read each range of cells of k_size length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t buf_size = range_size * k_size_row; + io.write_tensor(k_l[il], range.first * k_size_row, buf_size); + } + } + + if (!v_trans) { + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Write value type + const int32_t v_type_i = (int32_t)v_l[il]->type; + io.write(&v_type_i, sizeof(v_type_i)); + + // Write row size of value + const uint64_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa); + io.write(&v_size_row, sizeof(v_size_row)); + + // Read each range of cells of v_size length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t buf_size = range_size * v_size_row; + io.write_tensor(v_l[il], range.first * v_size_row, buf_size); + } + } + } else { + // When v is transposed, we also need the element size and get the element ranges from each row + const uint32_t kv_size = size; + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Write value type + const int32_t v_type_i = (int32_t)v_l[il]->type; + io.write(&v_type_i, sizeof(v_type_i)); + + // Write element size + const uint32_t v_size_el = ggml_type_size(v_l[il]->type); + io.write(&v_size_el, sizeof(v_size_el)); + + // Write GQA embedding size + io.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa)); + + // For each row, we get the element values of each cell + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + // Read each range of cells of v_size_el length each into tmp_buf and write out + for (const auto & range : 
cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t src_offset = (range.first + j * kv_size) * v_size_el; + const size_t buf_size = range_size * v_size_el; + io.write_tensor(v_l[il], src_offset, buf_size); + } + } + } + } +} + +bool llama_kv_cache_recurrent::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) { + if (dest_seq_id != -1) { + // single sequence + + seq_rm(dest_seq_id, -1, -1); + + llama_sbatch sbatch; + llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false); + + batch.n_tokens = cell_count; + batch.n_seq_tokens = cell_count; + batch.n_seqs = 1; + + for (uint32_t i = 0; i < cell_count; ++i) { + llama_pos pos; + uint32_t n_seq_id; + + io.read_to(&pos, sizeof(pos)); + io.read_to(&n_seq_id, sizeof(n_seq_id)); + + if (n_seq_id != 0) { + LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__); + return false; + } + + batch.pos[i] = pos; + } + batch.n_seq_id[0] = 1; + batch.seq_id[0] = &dest_seq_id; + + if (!find_slot(batch)) { + LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); + return false; + } + + // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values) + // Assume that this is one contiguous block of cells + GGML_ASSERT(head + cell_count <= size); + GGML_ASSERT(cells[head].pos == batch.pos[0]); + GGML_ASSERT(cells[head + cell_count - 1].pos == batch.pos[cell_count - 1]); + GGML_ASSERT(cells[head].has_seq_id(dest_seq_id)); + GGML_ASSERT(cells[head + cell_count - 1].has_seq_id(dest_seq_id)); + } else { + // whole KV cache restore + + if (cell_count > size) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__); + return false; + } + + clear(); + + for (uint32_t i = 0; i < cell_count; ++i) { + kv_cell & cell = cells[i]; + + llama_pos pos; + uint32_t n_seq_id; + + io.read_to(&pos, sizeof(pos)); + io.read_to(&n_seq_id, sizeof(n_seq_id)); + + cell.pos = pos; + + for (uint32_t j = 0; j < n_seq_id; ++j) { + llama_seq_id seq_id; + io.read_to(&seq_id, sizeof(seq_id)); + + // TODO: llama_kv_cache_recurrent should have a notion of max sequences + //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) { + if (seq_id < 0) { + //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); + LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id); + return false; + } + + cell.seq_id.insert(seq_id); + + int32_t & tail = cells[seq_id].tail; + if (tail != -1) { + LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail); + return false; + } + tail = i; + } + } + + head = 0; + used = cell_count; + } + + for (uint32_t i = 0; i < cell_count; ++i) { + uint32_t cell_id = head + i; + // make sure the recurrent states will keep their restored state + cells[cell_id].src = cell_id; + } + + return true; +} + +bool llama_kv_cache_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell_count) { + uint32_t v_trans; + uint32_t n_layer; + io.read_to(&v_trans, sizeof(v_trans)); + io.read_to(&n_layer, sizeof(n_layer)); + + if (n_layer != hparams.n_layer) { + LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer); + return false; + } + if (cell_count > size) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size); + return false; + } + if (false != (bool) v_trans) { 
+ LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__); + return false; + } + + // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + + // Read type of key + int32_t k_type_i_ref; + io.read_to(&k_type_i_ref, sizeof(k_type_i_ref)); + const int32_t k_type_i = (int32_t) k_l[il]->type; + if (k_type_i != k_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il); + return false; + } + + // Read row size of key + uint64_t k_size_row_ref; + io.read_to(&k_size_row_ref, sizeof(k_size_row_ref)); + const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); + if (k_size_row != k_size_row_ref) { + LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il); + return false; + } + + if (cell_count) { + // Read and set the keys for the whole cell range + ggml_backend_tensor_set(k_l[il], io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row); + } + } + + if (!v_trans) { + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Read type of value + int32_t v_type_i_ref; + io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)v_l[il]->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } + + // Read row size of value + uint64_t v_size_row_ref; + io.read_to(&v_size_row_ref, sizeof(v_size_row_ref)); + const size_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa); + if (v_size_row != v_size_row_ref) { + LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il); + return false; + } + + if (cell_count) { + // Read and set the values for the whole cell range + ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row); + } + } + } else { + // For each layer, read the values for each cell (transposed) + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Read type of value + int32_t v_type_i_ref; + io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)v_l[il]->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } + + // Read element size of value + uint32_t v_size_el_ref; + io.read_to(&v_size_el_ref, sizeof(v_size_el_ref)); + const size_t v_size_el = ggml_type_size(v_l[il]->type); + if (v_size_el != v_size_el_ref) { + LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il); + return false; + } + + // Read GQA embedding size + uint32_t n_embd_v_gqa_ref; + io.read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref)); + if (n_embd_v_gqa != n_embd_v_gqa_ref) { + LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il); + return false; + } + + if (cell_count) { + // For each row in the transposed matrix, read the values for the whole cell range + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { 
+ const size_t dst_offset = (head + j * size) * v_size_el; + ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); + } + } + } + } + + return true; +} + +// +// llama_kv_cache_recurrent_state +// + +llama_kv_cache_recurrent_state::llama_kv_cache_recurrent_state(llama_memory_status status) : status(status) {} + +llama_kv_cache_recurrent_state::llama_kv_cache_recurrent_state( + llama_memory_status status, + llama_kv_cache_recurrent * kv) : status(status), kv(kv), is_full(true) { +} + +llama_kv_cache_recurrent_state::llama_kv_cache_recurrent_state( + llama_memory_status status, + llama_kv_cache_recurrent * kv, + llama_sbatch sbatch, + std::vector ubatches) : status(status), kv(kv), sbatch(std::move(sbatch)), ubatches(std::move(ubatches)) {} + +llama_kv_cache_recurrent_state::~llama_kv_cache_recurrent_state() = default; + +bool llama_kv_cache_recurrent_state::next() { + assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + + if (++i_next >= ubatches.size()) { + return false; + } + + return true; +} + +bool llama_kv_cache_recurrent_state::apply() { + assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + + kv->find_slot(ubatches[i_next]); + + return true; +} + +std::vector & llama_kv_cache_recurrent_state::out_ids() { + assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + + return sbatch.out_ids; +} + +llama_memory_status llama_kv_cache_recurrent_state::get_status() const { + return status; +} + +const llama_ubatch & llama_kv_cache_recurrent_state::get_ubatch() const { + assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + + return ubatches[i_next]; +} + +uint32_t llama_kv_cache_recurrent_state::get_n_kv() const { + return is_full ? kv->size : kv->n; +} + +uint32_t llama_kv_cache_recurrent_state::get_head() const { + return is_full ? 
0 : kv->head; +} + +uint32_t llama_kv_cache_recurrent_state::get_size() const { + return kv->size; +} + +ggml_tensor * llama_kv_cache_recurrent_state::get_k_l(int32_t il) const { + return kv->k_l[il]; +} + +ggml_tensor * llama_kv_cache_recurrent_state::get_v_l(int32_t il) const { + return kv->v_l[il]; +} + +int32_t llama_kv_cache_recurrent_state::s_copy(int i) const { + return kv->s_copy(i); +} + +float llama_kv_cache_recurrent_state::s_mask(int i) const { + return kv->s_mask(i); +} diff --git a/src/llama-kv-cache-recurrent.h b/src/llama-kv-cache-recurrent.h new file mode 100644 index 000000000..a178ae85c --- /dev/null +++ b/src/llama-kv-cache-recurrent.h @@ -0,0 +1,191 @@ +#pragma once + +#include "llama-batch.h" +#include "llama-graph.h" +#include "llama-kv-cache.h" + +#include +#include + +// +// llama_kv_cache_recurrent +// + +// TODO: extract the KV cache state used for graph computation into llama_kv_cache_recurrent_state_i +// see the implementation of llama_kv_cache_unified_state_i for an example how to do it +class llama_kv_cache_recurrent : public llama_kv_cache { +public: + llama_kv_cache_recurrent( + const llama_model & model, + ggml_type type_k, + ggml_type type_v, + bool offload, + uint32_t kv_size, + uint32_t n_seq_max); + + ~llama_kv_cache_recurrent() = default; + + // + // llama_memory_i + // + + void clear() override; + + bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override; + void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override; + void seq_keep(llama_seq_id seq_id) override; + void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override; + void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override; + + llama_pos seq_pos_min(llama_seq_id seq_id) const override; + llama_pos seq_pos_max(llama_seq_id seq_id) const override; + + // + // llama_kv_cache + // + + llama_memory_state_ptr init_batch( + const llama_batch & batch, + uint32_t n_ubatch, + bool embd_pooled, + bool logits_all) override; + + llama_memory_state_ptr init_full() override; + + bool update(llama_context & lctx) override; + + void defrag_sched(float thold) override; + + bool prepare(const std::vector & ubatches); + + // find a contiguous slot of kv cells and emplace the ubatch there + bool find_slot(const llama_ubatch & ubatch); + + bool get_can_shift() const override; + + // TODO: temporary methods - they are not really const as they do const_cast<>, fix this + int32_t s_copy(int i) const; + float s_mask(int i) const; + + // state write/load + + void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override; + void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override; + + uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot()) + uint32_t size = 0; // total number of cells, shared across all sequences + uint32_t used = 0; // used cells (i.e. 
at least one seq_id) + + // computed before each graph build + uint32_t n = 0; + + // TODO: optimize for recurrent state needs + struct kv_cell { + llama_pos pos = -1; + int32_t src = -1; // used to copy states + int32_t tail = -1; + + std::set seq_id; + + bool has_seq_id(const llama_seq_id & id) const { + return seq_id.find(id) != seq_id.end(); + } + + bool is_empty() const { + return seq_id.empty(); + } + + bool is_same_seq(const kv_cell & other) const { + return seq_id == other.seq_id; + } + }; + + std::vector cells; + + std::vector k_l; // per layer + std::vector v_l; + +private: + //const llama_model & model; + const llama_hparams & hparams; + + const uint32_t n_seq_max = 1; + + std::vector ctxs; + std::vector bufs; + + size_t total_size() const; + + size_t size_k_bytes() const; + size_t size_v_bytes() const; + + void state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) const; + void state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges) const; + + bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1); + bool state_read_data(llama_io_read_i & io, uint32_t cell_count); +}; + +class llama_kv_cache_recurrent_state : public llama_memory_state_i { +public: + // used for errors + llama_kv_cache_recurrent_state(llama_memory_status status); + + // used to create a full-cache state + llama_kv_cache_recurrent_state( + llama_memory_status status, + llama_kv_cache_recurrent * kv); + + // used to create a state from a batch + llama_kv_cache_recurrent_state( + llama_memory_status status, + llama_kv_cache_recurrent * kv, + llama_sbatch sbatch, + std::vector ubatches); + + virtual ~llama_kv_cache_recurrent_state(); + + // + // llama_memory_state_i + // + + bool next() override; + bool apply() override; + + std::vector & out_ids() override; + + llama_memory_status get_status() const override; + const llama_ubatch & get_ubatch() const override; + + // + // llama_kv_cache_recurrent_state specific API + // + + uint32_t get_n_kv() const; + uint32_t get_head() const; + uint32_t get_size() const; + + ggml_tensor * get_k_l(int32_t il) const; + ggml_tensor * get_v_l(int32_t il) const; + + int32_t s_copy(int i) const; + float s_mask(int i) const; + +private: + const llama_memory_status status; + + llama_kv_cache_recurrent * kv; + + llama_sbatch sbatch; + + size_t i_next = 0; + + std::vector ubatches; + + // + // data needed for building the compute graph for the current ubatch: + // TODO: extract all the state like `head` and `n` here + // + + const bool is_full = false; +}; diff --git a/src/llama-kv-cache-unified-iswa.cpp b/src/llama-kv-cache-unified-iswa.cpp new file mode 100644 index 000000000..0eb045634 --- /dev/null +++ b/src/llama-kv-cache-unified-iswa.cpp @@ -0,0 +1,249 @@ +#include "llama-kv-cache-unified-iswa.h" + +#include "llama-impl.h" +#include "llama-batch.h" +#include "llama-model.h" + +#include +#include + +// +// llama_kv_cache_unified_iswa +// + +llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa( + const llama_model & model, + ggml_type type_k, + ggml_type type_v, + bool v_trans, + bool offload, + bool swa_full, + uint32_t kv_size, + uint32_t n_seq_max, + uint32_t n_ubatch, + uint32_t n_pad) : hparams(model.hparams) { + llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); }; + llama_kv_cache_unified::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); }; + + const uint32_t size_base = kv_size; + + 
uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_ubatch, n_pad)); + + // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size + if (swa_full) { + LLAMA_LOG_WARN("%s: using full-size SWA cache (ref: %s)\n", + __func__, "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055"); + + size_swa = size_base; + } + + LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base); + + kv_base = std::make_unique( + model, std::move(filter_base), type_k, type_v, + v_trans, offload, size_base, n_seq_max, n_pad, + 0, LLAMA_SWA_TYPE_NONE); + + LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa); + + kv_swa = std::make_unique( + model, std::move(filter_swa), type_k, type_v, + v_trans, offload, size_swa, n_seq_max, n_pad, + hparams.n_swa, hparams.swa_type); +} + +void llama_kv_cache_unified_iswa::clear() { + kv_base->clear(); + kv_swa ->clear(); +} + +bool llama_kv_cache_unified_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + bool res = true; + + res = res & kv_base->seq_rm(seq_id, p0, p1); + res = res & kv_swa ->seq_rm(seq_id, p0, p1); + + return res; +} + +void llama_kv_cache_unified_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + kv_base->seq_cp(seq_id_src, seq_id_dst, p0, p1); + kv_swa ->seq_cp(seq_id_src, seq_id_dst, p0, p1); +} + +void llama_kv_cache_unified_iswa::seq_keep(llama_seq_id seq_id) { + kv_base->seq_keep(seq_id); + kv_swa ->seq_keep(seq_id); +} + +void llama_kv_cache_unified_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) { + kv_base->seq_add(seq_id, p0, p1, shift); + kv_swa ->seq_add(seq_id, p0, p1, shift); +} + +void llama_kv_cache_unified_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { + kv_base->seq_div(seq_id, p0, p1, d); + kv_swa ->seq_div(seq_id, p0, p1, d); +} + +llama_pos llama_kv_cache_unified_iswa::seq_pos_min(llama_seq_id seq_id) const { + // the base cache is a superset of the SWA cache, so we can just check the SWA cache + return kv_swa->seq_pos_min(seq_id); +} + +llama_pos llama_kv_cache_unified_iswa::seq_pos_max(llama_seq_id seq_id) const { + return kv_swa->seq_pos_max(seq_id); +} + +llama_memory_state_ptr llama_kv_cache_unified_iswa::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_pooled, bool logits_all) { + GGML_UNUSED(embd_pooled); + + // TODO: if we fail with split_simple, we should attempt different splitting strategies + // but to do that properly, we first have to refactor the batches to be more flexible + + auto sbatch = llama_sbatch(batch, hparams.n_embd, true, logits_all); + + std::vector ubatches; + + while (sbatch.n_tokens > 0) { + auto ubatch = sbatch.split_simple(n_ubatch); + + ubatches.push_back(ubatch); + } + + auto heads_base = kv_base->prepare(ubatches); + if (heads_base.empty()) { + return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); + } + + auto heads_swa = kv_swa->prepare(ubatches); + if (heads_swa.empty()) { + return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); + } + + assert(heads_base.size() == heads_swa.size()); + + return std::make_unique(LLAMA_MEMORY_STATUS_SUCCESS, + this, std::move(sbatch), std::move(heads_base), std::move(heads_swa), std::move(ubatches)); +} + +llama_memory_state_ptr llama_kv_cache_unified_iswa::init_full() { + return std::make_unique(LLAMA_MEMORY_STATUS_SUCCESS, this); +} + +bool llama_kv_cache_unified_iswa::update(llama_context & 
lctx) { + bool res = false; + + res = res | kv_base->update(lctx); + res = res | kv_swa ->update(lctx); + + return res; +} + +void llama_kv_cache_unified_iswa::defrag_sched(float thold) { + kv_base->defrag_sched(thold); + kv_swa ->defrag_sched(thold); +} + +bool llama_kv_cache_unified_iswa::get_can_shift() const { + return kv_base->get_size() == kv_swa->get_size(); +} + +void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id) const { + kv_base->state_write(io, seq_id); + kv_swa ->state_write(io, seq_id); +} + +void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id) { + kv_base->state_read(io, seq_id); + kv_swa ->state_read(io, seq_id); +} + +llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_base() const { + return kv_base.get(); +} + +llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_swa() const { + return kv_swa.get(); +} + +// +// llama_kv_cache_unified_iswa_state +// + +llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state(llama_memory_status status) : status(status) {} + +llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state( + llama_memory_status status, + llama_kv_cache_unified_iswa * kv) : status(status) { + state_base.reset(new llama_kv_cache_unified_state(status, kv->get_base())); + state_swa .reset(new llama_kv_cache_unified_state(status, kv->get_swa ())); +} + +llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state( + llama_memory_status status, + llama_kv_cache_unified_iswa * kv, + llama_sbatch sbatch, + std::vector heads_base, + std::vector heads_swa, + std::vector ubatches) + : status(status), + sbatch(std::move(sbatch)), + ubatches(std::move(ubatches)) { + // note: here we copy the ubatches. not sure if this is ideal + state_base.reset(new llama_kv_cache_unified_state(status, kv->get_base(), {}, std::move(heads_base), this->ubatches)); + state_swa .reset(new llama_kv_cache_unified_state(status, kv->get_swa (), {}, std::move(heads_swa), this->ubatches)); + } + +llama_kv_cache_unified_iswa_state:: ~llama_kv_cache_unified_iswa_state() = default; + +bool llama_kv_cache_unified_iswa_state::next() { + assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + + state_base->next(); + state_swa ->next(); + + if (++i_next >= ubatches.size()) { + return false; + } + + return true; +} + +bool llama_kv_cache_unified_iswa_state::apply() { + assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + + bool res = true; + + res = res & state_base->apply(); + res = res & state_swa ->apply(); + + return res; +} + +std::vector & llama_kv_cache_unified_iswa_state::out_ids() { + assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + + return sbatch.out_ids; +} + +llama_memory_status llama_kv_cache_unified_iswa_state::get_status() const { + return status; +} + +const llama_ubatch & llama_kv_cache_unified_iswa_state::get_ubatch() const { + assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + return ubatches[i_next]; +} + +const llama_kv_cache_unified_state * llama_kv_cache_unified_iswa_state::get_base() const { + assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + + return state_base.get(); +} + +const llama_kv_cache_unified_state * llama_kv_cache_unified_iswa_state::get_swa() const { + assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + + return state_swa.get(); +} diff --git a/src/llama-kv-cache-unified-iswa.h b/src/llama-kv-cache-unified-iswa.h new file mode 100644 index 000000000..8b067da03 --- /dev/null +++ b/src/llama-kv-cache-unified-iswa.h @@ -0,0 +1,136 @@ +#pragma once + +#include 
"llama-kv-cache-unified.h" + +#include + +// +// llama_kv_cache_unified_iswa +// + +// utilizes two instances of llama_kv_cache_unified +// the first instance is for the non-SWA layers of the model and the second instance is for the SWA layers + +class llama_kv_cache_unified_iswa : public llama_kv_cache { +public: + llama_kv_cache_unified_iswa( + const llama_model & model, + ggml_type type_k, + ggml_type type_v, + bool v_trans, + bool offload, + bool swa_full, + uint32_t kv_size, + uint32_t n_seq_max, + uint32_t n_ubatch, + uint32_t n_pad); + + ~llama_kv_cache_unified_iswa() = default; + + // + // llama_memory_i + // + + void clear() override; + + bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override; + void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override; + void seq_keep(llama_seq_id seq_id) override; + void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override; + void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override; + + llama_pos seq_pos_min(llama_seq_id seq_id) const override; + llama_pos seq_pos_max(llama_seq_id seq_id) const override; + + // + // llama_kv_cache + // + + llama_memory_state_ptr init_batch( + const llama_batch & batch, + uint32_t n_ubatch, + bool embd_pooled, + bool logits_all) override; + + llama_memory_state_ptr init_full() override; + + bool update(llama_context & lctx) override; + + void defrag_sched(float thold) override; + + bool get_can_shift() const override; + + // state write/load + + void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override; + void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override; + + // + // llama_kv_cache_unified_iswa specific API + // + + llama_kv_cache_unified * get_base() const; + llama_kv_cache_unified * get_swa () const; + +private: + const llama_hparams & hparams; + + std::unique_ptr kv_base; + std::unique_ptr kv_swa; +}; + +class llama_kv_cache_unified_iswa_state : public llama_memory_state_i { +public: + // used for errors + llama_kv_cache_unified_iswa_state(llama_memory_status status); + + // used to create a full-cache state + llama_kv_cache_unified_iswa_state( + llama_memory_status status, + llama_kv_cache_unified_iswa * kv); + + // used to create a state from a batch + llama_kv_cache_unified_iswa_state( + llama_memory_status status, + llama_kv_cache_unified_iswa * kv, + llama_sbatch sbatch, + std::vector heads_base, + std::vector heads_swa, + std::vector ubatches); + + virtual ~llama_kv_cache_unified_iswa_state(); + + // + // llama_memory_state_i + // + + bool next() override; + bool apply() override; + + std::vector & out_ids() override; + + llama_memory_status get_status() const override; + const llama_ubatch & get_ubatch() const override; + + // + // llama_kv_cache_unified_iswa_state specific API + // + + const llama_kv_cache_unified_state * get_base() const; + const llama_kv_cache_unified_state * get_swa() const; + +private: + const llama_memory_status status; + + //llama_kv_cache_unified_iswa * kv; + + llama_sbatch sbatch; + + // the index of the next ubatch to process + size_t i_next = 0; + + std::vector ubatches; + + std::unique_ptr state_base; + std::unique_ptr state_swa; +}; diff --git a/src/llama-kv-cache-unified.cpp b/src/llama-kv-cache-unified.cpp new file mode 100644 index 000000000..a81715476 --- /dev/null +++ b/src/llama-kv-cache-unified.cpp @@ -0,0 +1,1717 @@ +#include "llama-kv-cache-unified.h" + +#include "llama-impl.h" +#include "llama-model.h" 
+#include "llama-context.h" + +#include +#include +#include +#include +#include +#include + +// +// llama_kv_cache_unified +// + +llama_kv_cache_unified::llama_kv_cache_unified( + const llama_model & model, + layer_filter_cb && filter, + ggml_type type_k, + ggml_type type_v, + bool v_trans, + bool offload, + uint32_t kv_size, + uint32_t n_seq_max, + uint32_t n_pad, + uint32_t n_swa, + llama_swa_type swa_type) : + model(model), hparams(model.hparams), v_trans(v_trans), + n_seq_max(n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) { + + GGML_ASSERT(kv_size % n_pad == 0); + + // create a context for each buffer type + std::map ctx_map; + auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { + auto it = ctx_map.find(buft); + if (it == ctx_map.end()) { + ggml_init_params params = { + /*.mem_size =*/ size_t(2u*hparams.n_layer*ggml_tensor_overhead()), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + ggml_context * ctx = ggml_init(params); + if (!ctx) { + return nullptr; + } + + ctx_map[buft] = ctx; + ctxs.emplace_back(ctx); + + return ctx; + } + + return it->second; + }; + + head = 0; + + cells.resize(kv_size); + + for (uint32_t il = 0; il < hparams.n_layer; il++) { + if (filter && !filter(il)) { + LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il); + continue; + } + + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + const char * dev_name = "CPU"; + + ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type(); + + if (offload) { + auto * dev = model.dev_layer(il); + buft = ggml_backend_dev_buffer_type(dev); + + dev_name = ggml_backend_dev_name(dev); + } + + LLAMA_LOG_DEBUG("%s: layer %3d: dev = %s\n", __func__, il, dev_name); + + ggml_context * ctx = ctx_for_buft(buft); + if (!ctx) { + throw std::runtime_error("failed to create ggml context for kv cache"); + } + + ggml_tensor * k; + ggml_tensor * v; + + k = ggml_new_tensor_2d(ctx, type_k, n_embd_k_gqa, kv_size); + v = ggml_new_tensor_2d(ctx, type_v, n_embd_v_gqa, kv_size); + + ggml_format_name(k, "cache_k_l%d", il); + ggml_format_name(v, "cache_v_l%d", il); + + map_layer_ids[il] = layers.size(); + layers.push_back({ il, k, v }); + } + + // allocate tensors and initialize the buffers to avoid NaNs in the padding + for (auto it : ctx_map) { + auto * buft = it.first; + auto * ctx = it.second; + + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (!buf) { + throw std::runtime_error("failed to allocate buffer for kv cache"); + } + + LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); + + ggml_backend_buffer_clear(buf, 0); + bufs.emplace_back(buf); + } + + { + const size_t memory_size_k = size_k_bytes(); + const size_t memory_size_v = size_v_bytes(); + + LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, + (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max, + ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), + ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); + } +} + +void llama_kv_cache_unified::clear() { + cells.reset(); + + head = 0; + + for (auto & buf : bufs) { + ggml_backend_buffer_clear(buf.get(), 0); + } +} + +bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos 
p1) { + uint32_t new_head = cells.size(); + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + for (uint32_t i = 0; i < cells.size(); ++i) { + if (!cells.pos_in(i, p0, p1)) { + continue; + } + + if (cells.seq_has(i, seq_id) && cells.seq_rm(i, seq_id)) { + if (new_head == cells.size()) { + new_head = i; + } + } + } + + // If we freed up a slot, set head to it so searching can start there. + if (new_head != cells.size() && new_head < head) { + head = new_head; + } + + return true; +} + +void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + if (seq_id_src == seq_id_dst) { + return; + } + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + for (uint32_t i = 0; i < cells.size(); ++i) { + if (!cells.pos_in(i, p0, p1)) { + continue; + } + + if (cells.seq_has(i, seq_id_src)) { + cells.seq_add(i, seq_id_dst); + } + } +} + +void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) { + uint32_t new_head = cells.size(); + + for (uint32_t i = 0; i < cells.size(); ++i) { + if (cells.seq_keep(i, seq_id)) { + if (new_head == cells.size()) { + new_head = i; + } + } + } + + // If we freed up a slot, set head to it so searching can start there. + if (new_head != cells.size() && new_head < head) { + head = new_head; + } +} + +void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) { + if (shift == 0) { + return; + } + + uint32_t new_head = cells.size(); + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // If there is no range then return early to avoid looping over all cells. + if (p0 == p1) { + return; + } + + for (uint32_t i = 0; i < cells.size(); ++i) { + if (!cells.pos_in(i, p0, p1)) { + continue; + } + + if (cells.seq_has(i, seq_id)) { + if (cells.pos_add(i, shift)) { + if (new_head == cells.size()) { + new_head = i; + } + } + } + } + + // If we freed up a slot, set head to it so searching can start there. + // Otherwise we just start the next search from the beginning. + head = new_head != cells.size() ? new_head : 0; +} + +void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { + if (d == 1) { + return; + } + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // If there is no range then return early to avoid looping over the cache. 
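+    // note: positions are divided in place via cells.pos_div(); as with seq_add(), only cells that
+    //       belong to seq_id and whose position falls inside the requested range are touched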
+    if (p0 == p1) {
+        return;
+    }
+
+    for (uint32_t i = 0; i < cells.size(); ++i) {
+        if (!cells.pos_in(i, p0, p1)) {
+            continue;
+        }
+
+        if (cells.seq_has(i, seq_id)) {
+            cells.pos_div(i, d);
+        }
+    }
+}
+
+llama_pos llama_kv_cache_unified::seq_pos_min(llama_seq_id seq_id) const {
+    return cells.seq_pos_min(seq_id);
+}
+
+llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
+    return cells.seq_pos_max(seq_id);
+}
+
+llama_memory_state_ptr llama_kv_cache_unified::init_batch(
+        const llama_batch & batch,
+        uint32_t n_ubatch,
+        bool embd_pooled,
+        bool logits_all) {
+    GGML_UNUSED(embd_pooled);
+
+    auto sbatch = llama_sbatch(batch, hparams.n_embd, true, logits_all);
+
+    std::vector<llama_ubatch> ubatches;
+    while (sbatch.n_tokens > 0) {
+        ubatches.push_back(sbatch.split_simple(n_ubatch));
+    }
+
+    auto heads = prepare(ubatches);
+    if (heads.empty()) {
+        return std::make_unique<llama_kv_cache_unified_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+    }
+
+    return std::make_unique<llama_kv_cache_unified_state>(LLAMA_MEMORY_STATUS_SUCCESS,
+            this, std::move(sbatch), std::move(heads), std::move(ubatches));
+}
+
+llama_memory_state_ptr llama_kv_cache_unified::init_full() {
+    return std::make_unique<llama_kv_cache_unified_state>(LLAMA_MEMORY_STATUS_SUCCESS, this);
+}
+
+std::vector<uint32_t> llama_kv_cache_unified::prepare(const std::vector<llama_ubatch> & ubatches) {
+    std::vector<uint32_t> res;
+
+    struct state {
+        uint32_t head_old; // old position of the head, before placing the ubatch
+        uint32_t head_new; // new position of the head, after placing the ubatch
+
+        llama_kv_cells_unified cells; // copy of the old cells, before placing the ubatch
+    };
+
+    // remember the old state of the cells so we can restore it in the end
+    std::vector<state> states;
+
+    bool success = true;
+
+    for (const auto & ubatch : ubatches) {
+        // only find a suitable slot for the ubatch. don't modify the cells yet
+        const int32_t head_new = find_slot(ubatch);
+        if (head_new < 0) {
+            success = false;
+            break;
+        }
+
+        // remember the position that we found
+        res.push_back(head_new);
+
+        // store the old state of the cells in the recovery stack
+        states.push_back({head, (uint32_t) head_new, cells.cp(head_new, ubatch.n_tokens)});
+
+        // now emplace the ubatch
+        apply_ubatch(head_new, ubatch);
+    }
+
+    // iterate backwards and restore the cells to their original state
+    for (auto it = states.rbegin(); it != states.rend(); ++it) {
+        cells.set(it->head_new, it->cells);
+        head = it->head_old;
+    }
+
+    if (!success) {
+        return {};
+    }
+
+    return res;
+}
+
+bool llama_kv_cache_unified::update(llama_context & lctx) {
+    bool updated = false;
+
+    auto * sched = lctx.get_sched();
+
+    if (cells.get_has_shift()) {
+        if (!get_can_shift()) {
+            GGML_ABORT("The current KV cache / model configuration does not support K-shift");
+        }
+
+        LLAMA_LOG_DEBUG("%s: applying K-shift\n", __func__);
+
+        // apply K-shift if needed
+        if (hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
+            ggml_backend_sched_reset(sched);
+
+            auto * gf = lctx.graph_init();
+
+            auto res = build_graph_shift(lctx.get_cparams(), lctx.get_ctx_compute(), gf);
+            if (!res) {
+                LLAMA_LOG_ERROR("%s: failed to build graph for K-shift\n", __func__);
+                return updated;
+            }
+
+            if (!ggml_backend_sched_alloc_graph(sched, gf)) {
+                LLAMA_LOG_ERROR("%s: failed to allocate compute graph for K-shift\n", __func__);
+                return updated;
+            }
+
+            res->set_inputs(nullptr);
+
+            if (lctx.graph_compute(gf, false) != GGML_STATUS_SUCCESS) {
+                LLAMA_LOG_ERROR("%s: failed to compute K-shift\n", __func__);
+                return updated;
+            }
+
+            updated = true;
+        }
+
+        cells.reset_shift();
+    }
+
+    if (do_defrag) {
+        LLAMA_LOG_DEBUG("%s: defragmenting KV 
cache\n", __func__); + + if (defrag_prepare(lctx.graph_max_nodes())) { + ggml_backend_sched_reset(sched); + + auto * gf = lctx.graph_init(); + + auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf); + if (!res) { + LLAMA_LOG_ERROR("%s: failed to build graph for defrag\n", __func__); + return updated; + } + + if (!ggml_backend_sched_alloc_graph(sched, gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute graph for defrag\n", __func__); + return updated; + } + + res->set_inputs(nullptr); + + if (lctx.graph_compute(gf, false) != GGML_STATUS_SUCCESS) { + LLAMA_LOG_ERROR("%s: failed to compute defrag\n", __func__); + return updated; + } + + updated = true; + } + + do_defrag = false; + } + + return updated; +} + +void llama_kv_cache_unified::defrag_sched(float thold) { + const auto n_kv = cells.used_max_p1(); + + // - do not defrag small contexts (i.e. < 2048 tokens) + // - count the padding towards the number of used tokens + const float fragmentation = n_kv >= 2048 ? std::max(0.0f, 1.0f - (float(cells.get_used() + n_pad)/n_kv)) : 0.0f; + + // queue defragmentation for next llama_kv_cache_update + if (fragmentation > thold) { + LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation); + + do_defrag = true; + } +} + +int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const { + const uint32_t n_tokens = ubatch.n_tokens; + + uint32_t head_cur = this->head; + + // if we have enough unused cells before the current head -> + // better to start searching from the beginning of the cache, hoping to fill it + if (head_cur > cells.get_used() + 2*ubatch.n_tokens) { + head_cur = 0; + } + + // otherwise, one cell per token. + + if (n_tokens > cells.size()) { + LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %u\n", __func__, n_tokens, cells.size()); + return -1; + } + +//#define FIND_SLOT_DEBUG 1 +#if FIND_SLOT_DEBUG + LLAMA_LOG_WARN("begin: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", cells.used_max_p1(), cells.get_used(), head, n_swa); + + // for debugging + { + std::string ss; + if (n_swa > 0) { + for (uint32_t i = 0; i < cells.size(); ++i) { + if (cells.is_empty(i)) { + ss += '.'; + } else { + ss += std::to_string(cells.seq_get(i)); + } + if (i%256 == 255) { + ss += '\n'; + } + } + } + LLAMA_LOG_WARN("\n%s\n", ss.c_str()); + } + + for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) { + if (cells.seq_pos_min(s) < 0) { + continue; + } + + LLAMA_LOG_WARN("kv_cells: n_swa = %4d, min[%d] = %5d, max[%d] = %5d\n", n_swa, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s)); + } +#endif + + uint32_t n_tested = 0; + + while (true) { + if (head_cur + n_tokens > cells.size()) { + n_tested += cells.size() - head_cur; + head_cur = 0; + continue; + } + + // keep track of what the minimum sequence positions would be if we accept the ubatch + llama_seq_id seq_pos_min[LLAMA_MAX_PARALLEL_SEQUENCES]; + for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) { + seq_pos_min[s] = cells.seq_pos_min(s); + } + + bool found = true; + for (uint32_t i = 0; i < n_tokens; i++) { + const llama_pos pos = ubatch.pos[i]; + const llama_seq_id seq_id = ubatch.seq_id[i][0]; + + // can we use this cell? 
either: + // - the cell is empty + // - the cell is occupied only by one sequence: + // - mask causally, if the sequence is the same as the one we are inserting + // - mask SWA, using current max pos for that sequence in the cache + // always insert in the cell with minimum pos + bool can_use = cells.is_empty(head_cur + i); + + if (!can_use && cells.seq_count(head_cur + i) == 1) { + const llama_pos pos_cell = cells.pos_get(head_cur + i); + + // causal mask + if (cells.seq_has(head_cur + i, seq_id)) { + can_use = pos_cell >= pos; + } + + if (!can_use) { + const llama_seq_id seq_id_cell = cells.seq_get(head_cur + i); + + // SWA mask + // note: we insert only in the cell with minimum pos in order to preserve the invariant that + // all positions between [pos_min, pos_max] for each sequence will be present in the cache + // ref: https://github.com/ggml-org/llama.cpp/pull/13746#issuecomment-2916057092 + if (pos_cell == seq_pos_min[seq_id_cell] && + is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) { + seq_pos_min[seq_id_cell]++; + can_use = true; + } + } + } + + if (!can_use) { + found = false; + head_cur += i + 1; + n_tested += i + 1; + break; + } + } + + if (found) { + break; + } + + if (n_tested >= cells.size()) { + //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens); + return -1; + } + } + + return head_cur; +} + +void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch & ubatch) { + for (uint32_t i = 0; i < ubatch.n_tokens; ++i) { + if (!cells.is_empty(head_cur + i)) { + cells.rm(head_cur + i); + } + + cells.pos_set(head_cur + i, ubatch.pos[i]); + + for (int32_t j = 0; j < ubatch.n_seq_id[i]; j++) { + cells.seq_add(head_cur + i, ubatch.seq_id[i][j]); + } + } + + // move the head at the end of the slot + head = head_cur + ubatch.n_tokens; +} + +bool llama_kv_cache_unified::get_can_shift() const { + return true; +} + +uint32_t llama_kv_cache_unified::get_size() const { + return cells.size(); +} + +uint32_t llama_kv_cache_unified::get_n_kv() const { + return std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad))); +} + +ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv) const { + const int32_t ikv = map_layer_ids.at(il); + + auto * k = layers[ikv].k; + + return ggml_view_3d(ctx, k, + hparams.n_embd_head_k, hparams.n_head_kv(il), n_kv, + ggml_row_size(k->type, hparams.n_embd_head_k), + ggml_row_size(k->type, hparams.n_embd_k_gqa(il)), + 0); +} + +ggml_tensor * llama_kv_cache_unified::get_v(ggml_context * ctx, int32_t il, uint32_t n_kv) const { + const int32_t ikv = map_layer_ids.at(il); + + auto * v = layers[ikv].v; + + if (!v_trans) { + // note: v->nb[1] <= v->nb[2] + return ggml_view_3d(ctx, v, + hparams.n_embd_head_v, hparams.n_head_kv(il), n_kv, + ggml_row_size(v->type, hparams.n_embd_head_v), // v->nb[1] + ggml_row_size(v->type, hparams.n_embd_v_gqa(il)), // v->nb[2] + 0); + } + + // note: v->nb[1] > v->nb[2] + return ggml_view_3d(ctx, v, + n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v, + ggml_row_size(v->type, v->ne[1]*hparams.n_embd_head_v), // v->nb[1] + ggml_row_size(v->type, v->ne[1]), // v->nb[2] + 0); +} + +ggml_tensor * llama_kv_cache_unified::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il, uint32_t head_cur) const { + const int32_t ikv = map_layer_ids.at(il); + + auto * k = layers[ikv].k; + + const int64_t n_tokens = k_cur->ne[2]; + + ggml_tensor * k_view = ggml_view_1d(ctx, k, + n_tokens*hparams.n_embd_k_gqa(il), + ggml_row_size(k->type, 
hparams.n_embd_k_gqa(il))*head_cur); + + return ggml_cpy(ctx, k_cur, k_view); +} + +ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il, uint32_t head_cur) const { + const int32_t ikv = map_layer_ids.at(il); + + auto * v = layers[ikv].v; + + const int64_t n_tokens = v_cur->ne[2]; + + v_cur = ggml_reshape_2d(ctx, v_cur, hparams.n_embd_v_gqa(il), n_tokens); + + ggml_tensor * v_view = nullptr; + + if (!v_trans) { + v_view = ggml_view_1d(ctx, v, + n_tokens*hparams.n_embd_v_gqa(il), + ggml_row_size(v->type, hparams.n_embd_v_gqa(il))*head_cur); + } else { + // note: the V cache is transposed when not using flash attention + v_view = ggml_view_2d(ctx, v, n_tokens, hparams.n_embd_v_gqa(il), + (v->ne[1])*ggml_element_size(v), + (head_cur)*ggml_element_size(v)); + + v_cur = ggml_transpose(ctx, v_cur); + } + + return ggml_cpy(ctx, v_cur, v_view); +} + +void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const { + const int64_t n_tokens = ubatch->n_tokens; + const int64_t n_seq_tokens = ubatch->n_seq_tokens; + const int64_t n_seqs = ubatch->n_seqs; + + GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); + float * data = (float *) dst->data; + + const auto n_kv = dst->ne[0]; + + // Use only the previous KV cells of the correct sequence for each token of the ubatch. + // It's assumed that if a token in the batch has multiple sequences, they are equivalent. + // Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch: + // Causal mask: + // xxx------- + // xxxx------ + // xxxxx----- + // Non-causal mask: + // xxxxx----- + // xxxxx----- + // xxxxx----- + // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615 + for (int h = 0; h < 1; ++h) { + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch->seq_id[s][0]; + + for (int j = 0; j < n_seq_tokens; ++j) { + const llama_pos p1 = ubatch->pos[s*n_seq_tokens + j]; + + for (uint32_t i = 0; i < n_kv; ++i) { + float f = 0.0f; + + bool masked = false; + + if (cells.is_empty(i)) { + masked = true; + } else { + const llama_pos p0 = cells.pos_get(i); + + // mask the token if not the same sequence + masked = masked || (!cells.seq_has(i, seq_id)); + + // mask future tokens + masked = masked || (causal_attn && p0 > p1); + + // apply SWA if any + masked = masked || (is_masked_swa(p0, p1)); + + if (!masked && hparams.use_alibi) { + f = -std::abs(p0 - p1); + } + } + + if (masked) { + f = -INFINITY; + } + + data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f; + } + } + } + + // mask padded tokens + if (data) { + for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) { + for (uint32_t j = 0; j < n_kv; ++j) { + data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY; + } + } + } + } +} + +void llama_kv_cache_unified::set_input_k_shift(ggml_tensor * dst) const { + GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); + + int32_t * data = (int32_t *) dst->data; + + for (uint32_t i = 0; i < cells.size(); ++i) { + data[i] = cells.is_empty(i) ? 
0 : cells.get_shift(i); + } +} + +void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const { + const int64_t n_tokens = ubatch->n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); + GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing + + int32_t * data = (int32_t *) dst->data; + + const int32_t n_kv = dst->ne[0]; + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + for (int i = 0; i < n_kv; ++i) { + // the position when the cells is empty is irrelevant - it will be masked out later in the attention + const llama_pos p0 = cells.is_empty(i) ? -1 : cells.pos_get(i); + + data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(p0, ubatch->pos[j], hparams.n_rel_attn_bkts, false); + } + } + } +} + +size_t llama_kv_cache_unified::total_size() const { + size_t size = 0; + + for (const auto & buf : bufs) { + size += ggml_backend_buffer_get_size(buf.get()); + } + + return size; +} + +size_t llama_kv_cache_unified::size_k_bytes() const { + size_t size_k_bytes = 0; + + for (const auto & layer : layers) { + size_k_bytes += ggml_nbytes(layer.k); + } + + return size_k_bytes; +} + +size_t llama_kv_cache_unified::size_v_bytes() const { + size_t size_v_bytes = 0; + + for (const auto & layer : layers) { + size_v_bytes += ggml_nbytes(layer.v); + } + + return size_v_bytes; +} + +ggml_tensor * llama_kv_cache_unified::build_rope_shift( + const llama_cparams & cparams, + ggml_context * ctx, + ggml_tensor * cur, + ggml_tensor * shift, + ggml_tensor * factors, + float freq_base, + float freq_scale) const { + const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; + + const auto & yarn_ext_factor = cparams.yarn_ext_factor; + const auto & yarn_beta_fast = cparams.yarn_beta_fast; + const auto & yarn_beta_slow = cparams.yarn_beta_slow; + + const auto & n_rot = hparams.n_rot; + const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE + // @ngxson : this is a workaround + // for M-RoPE, we want to rotate the whole vector when doing KV shift + // a normal RoPE should work, we just need to use the correct ordering + // ref: https://github.com/ggml-org/llama.cpp/pull/13870 + ? LLAMA_ROPE_TYPE_NEOX + : hparams.rope_type; + + // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly. + // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation. + const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2 + ? 
1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) + : cparams.yarn_attn_factor; + + ggml_tensor * tmp; + + if (ggml_is_quantized(cur->type)) { + // dequantize to f32 -> RoPE -> quantize back + tmp = ggml_cast(ctx, cur, GGML_TYPE_F32); + + tmp = ggml_rope_ext(ctx, tmp, + shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + + tmp = ggml_cpy(ctx, tmp, cur); + } else { + // we rotate only the first n_rot dimensions + tmp = ggml_rope_ext_inplace(ctx, cur, + shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + } + + return tmp; +} + +class llm_graph_input_k_shift : public llm_graph_input_i { +public: + llm_graph_input_k_shift(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {} + virtual ~llm_graph_input_k_shift() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * k_shift; // I32 [kv_size] + + const llama_kv_cache_unified * kv_self; +}; + +void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) { + GGML_UNUSED(ubatch); + + if (k_shift) { + kv_self->set_input_k_shift(k_shift); + } +} + +llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift( + const llama_cparams & cparams, + ggml_context * ctx, + ggml_cgraph * gf) const { + auto res = std::make_unique(); + + const auto & n_embd_head_k = hparams.n_embd_head_k; + //const auto & n_embd_head_v = hparams.n_embd_head_v; + + //GGML_ASSERT(kv_self->size == n_ctx); + + auto inp = std::make_unique(this); + + inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cparams.n_ctx); + ggml_set_input(inp->k_shift); + + for (const auto & layer : layers) { + const uint32_t il = layer.il; + + const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + + const float freq_base_l = model.get_rope_freq_base (cparams, il); + const float freq_scale_l = model.get_rope_freq_scale(cparams, il); + + ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); + + ggml_tensor * k = + ggml_view_3d(ctx, layer.k, + n_embd_head_k, n_head_kv, cells.size(), + ggml_row_size(layer.k->type, n_embd_head_k), + ggml_row_size(layer.k->type, n_embd_k_gqa), + 0); + + ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l); + + ggml_build_forward_expand(gf, cur); + } + + res->add_input(std::move(inp)); + + return res; +} + +llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( + const llama_cparams & cparams, + ggml_context * ctx, + ggml_cgraph * gf) const { + auto res = std::make_unique(); + + const auto & ids = defrag_info.ids; + +#if 0 + // CPU defrag + // + // TODO: optimizations are possible: + // - multiple threads + // - avoid copying to the host memory when already there + // + // likely not worth the effort, as we have ggml_graph based defrag + // + + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + + const uint32_t kv_size = size; + + std::vector buf_k; + std::vector buf_v; + + for (uint32_t il = 0; il < n_layer; ++il) { + const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); + const size_t k_size = ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size); + + const size_t v_size_el = ggml_type_size(v_l[il]->type); + const size_t v_size = ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size); + + buf_k.resize(k_size); + buf_v.resize(v_size); + + 
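+        // reference CPU path (disabled): pull the whole per-layer K and V tensors to host memory,
+        //     compact them according to the ids[] mapping computed by defrag_prepare(), then write
+        //     them back with ggml_backend_tensor_set() below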
ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size()); + + // batch move [i, i+nm) to [id, id+nm) + // note: cells can move only to a lower index + for (uint32_t i = 0; i < n_kv; ++i) { + const uint32_t id = ids[i]; + + if (i == id || id == n_kv) { + continue; + } + + uint32_t nm = 1; + + while (i + nm < n_kv && ids[i + nm] == id + nm) { + nm++; + } + + // move keys + { + const int64_t os = i*k_size_row; + const int64_t od = id*k_size_row; + + memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row); + } + + // move values (note: they are transposed) + { + const int64_t os = i; + const int64_t od = id; + + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); + } + } + + i += nm - 1; + } + + ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size()); + } +#else + for (uint32_t i = 0; i < ids.size(); ++i) { + const uint32_t id = ids[i]; + + if (i == id || id == ids.size()) { + continue; + } + + uint32_t nm = 1; + + while (i + nm < ids.size() && ids[i + nm] == id + nm) { + nm++; + } + + for (const auto & layer : layers) { + const uint32_t il = layer.il; + + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + ggml_tensor * view_k_src = ggml_view_2d(ctx, layer.k, + n_embd_k_gqa, nm, + ggml_row_size(layer.k->type, n_embd_k_gqa), + ggml_row_size(layer.k->type, n_embd_k_gqa*i)); + + ggml_tensor * view_k_dst = ggml_view_2d(ctx, layer.k, + n_embd_k_gqa, nm, + ggml_row_size(layer.k->type, n_embd_k_gqa), + ggml_row_size(layer.k->type, n_embd_k_gqa*id)); + + ggml_tensor * view_v_src; + ggml_tensor * view_v_dst; + + if (cparams.flash_attn) { + // NOTE: the V cache is not transposed when using flash attention + view_v_src = ggml_view_2d(ctx, layer.v, + n_embd_v_gqa, nm, + ggml_row_size(layer.v->type, n_embd_v_gqa), + ggml_row_size(layer.v->type, n_embd_v_gqa*i)); + + view_v_dst = ggml_view_2d(ctx, layer.v, + n_embd_v_gqa, nm, + ggml_row_size(layer.v->type, n_embd_v_gqa), + ggml_row_size(layer.v->type, n_embd_v_gqa*id)); + } else { + view_v_src = ggml_view_2d(ctx, layer.v, + nm, n_embd_v_gqa, + ggml_row_size(layer.v->type, cells.size()), + ggml_row_size(layer.v->type, i)); + + view_v_dst = ggml_view_2d(ctx, layer.v, + nm, n_embd_v_gqa, + ggml_row_size(layer.v->type, cells.size()), + ggml_row_size(layer.v->type, id)); + } + + ggml_build_forward_expand(gf, ggml_cpy(ctx, view_k_src, view_k_dst)); + ggml_build_forward_expand(gf, ggml_cpy(ctx, view_v_src, view_v_dst)); + } + + i += nm - 1; + } + + //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes); +#endif + + return res; +} + +bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { + const uint32_t n_layer = layers.size(); + + const uint32_t n_kv = cells.used_max_p1(); + const uint32_t n_used = cells.get_used(); + + assert(n_used <= n_kv); + + //const int64_t t_start = ggml_time_us(); + + // number of cells moved + uint32_t n_moves = 0; + + // each move requires 6*n_layer tensors (see graph_build_kv_self_defrag) + // - source view, destination view, copy operation + // - x2 for keys and values + //const uint32_t max_moves = max_nodes()/(6*n_layer); + // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 + const uint32_t max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer); + + // determine which KV 
cells to move where + // + // cell i moves to ids[i] + // + // if ids[i] == i || ids[i] == n_kv, then cell i is not moved + // + auto & ids = defrag_info.ids; + + ids.clear(); + ids.resize(n_kv, n_kv); + + for (uint32_t i0 = 0; i0 < n_used; ++i0) { + if (!cells.is_empty(i0)) { + ids[i0] = i0; + + continue; + } + + // found a hole - fill it with data from the end of the cache + + uint32_t nh = 1; + + // determine the size of the hole + while (i0 + nh < n_used && cells.is_empty(i0 + nh)) { + nh++; + } + + uint32_t nf = 0; + uint32_t is = n_kv - 1; + + // starting from the end, find nh non-empty cells + for (; is > i0; --is) { + if (cells.is_empty(is) || ids[is] != n_kv) { + continue; + } + + // non-empty cell which is not yet moved + nf++; + + if (nf == nh) { + break; + } + } + + // this can only happen if `n_used` is not accurate, which would be a bug + GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); + + nf = 0; + + uint32_t i1 = is; + + // are we moving a continuous block of memory? + bool cont = false; + + // should we stop searching for the next move? + bool stop = false; + + // go back and move the nf cells to the hole + for (; i1 < n_kv; ++i1) { + if (cells.is_empty(i1) || ids[i1] != n_kv) { + if (n_moves == max_moves) { + stop = true; + break; + } + + cont = false; + continue; + } + + // this cell goes to (i0 + nf) + ids[i1] = i0 + nf; + + // move the cell meta data + cells.mv(i1, i0 + nf); + + head = n_used; + + if (!cont) { + n_moves++; + cont = true; + } + + nf++; + + if (nf == nh) { + break; + } + } + + if (stop || n_moves == max_moves) { + break; + } + + //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh); + + i0 += nh - 1; + } + + if (n_moves == 0) { + return false; + } + + LLAMA_LOG_DEBUG("%s: (tmp log) KV defrag cell moves: %u\n", __func__, n_moves); + + LLAMA_LOG_DEBUG("%s: expected gf nodes: %u\n", __func__, 6*n_moves*n_layer); + + return true; +} + +bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const { + assert(p0 >= 0 && p1 >= 0); + + switch (swa_type) { + case LLAMA_SWA_TYPE_NONE: + { + } break; + case LLAMA_SWA_TYPE_STANDARD: + { + if (p1 - p0 >= (int32_t) n_swa) { + return true; + } + } break; + case LLAMA_SWA_TYPE_CHUNKED: + { + const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa; + + if (p0 < pos_chunk_start) { + return true; + } + } break; + } + + return false; +} + +void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const { + std::vector> cell_ranges; // ranges, from inclusive, to exclusive + uint32_t cell_count = 0; + + // Count the number of cells with the specified seq_id + // Find all the ranges of cells with this seq id (or all, when -1) + uint32_t cell_range_begin = cells.size(); + + for (uint32_t i = 0; i < cells.size(); ++i) { + if (!cells.is_empty(i) && (seq_id == -1 || cells.seq_has(i, seq_id))) { + ++cell_count; + if (cell_range_begin == cells.size()) { + cell_range_begin = i; + } + } else { + if (cell_range_begin != cells.size()) { + cell_ranges.emplace_back(cell_range_begin, i); + cell_range_begin = cells.size(); + } + } + } + + if (cell_range_begin != cells.size()) { + cell_ranges.emplace_back(cell_range_begin, cells.size()); + } + + // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count + uint32_t cell_count_check = 0; + for (const auto & range : cell_ranges) { + cell_count_check += range.second - range.first; + } + GGML_ASSERT(cell_count == cell_count_check); + + io.write(&cell_count, sizeof(cell_count)); + + 
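+    // layout: the cell count is written first, then the per-cell metadata (pos + seq ids) via
+    //     state_write_meta(), followed by the per-layer K/V tensor data via state_write_data()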
state_write_meta(io, cell_ranges, seq_id); + state_write_data(io, cell_ranges); +} + +void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id) { + uint32_t cell_count; + io.read_to(&cell_count, sizeof(cell_count)); + + bool res = true; + res = res && state_read_meta(io, cell_count, seq_id); + res = res && state_read_data(io, cell_count); + + if (!res) { + if (seq_id == -1) { + clear(); + } else { + seq_rm(seq_id, -1, -1); + } + throw std::runtime_error("failed to restore kv cache"); + } +} + +void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id) const { + for (const auto & range : cell_ranges) { + for (uint32_t i = range.first; i < range.second; ++i) { + std::vector seq_ids; + + for (llama_seq_id cur = 0; cur < (int) n_seq_max; ++cur) { + if (cur == seq_id || seq_id == -1) { + if (cells.seq_has(i, cur)) { + seq_ids.push_back(cur); + } + } + } + + const llama_pos pos = cells.pos_get(i); + const uint32_t n_seq_id = seq_ids.size(); + + io.write(&pos, sizeof(pos)); + io.write(&n_seq_id, sizeof(n_seq_id)); + + for (const auto & seq_id : seq_ids) { + io.write(&seq_id, sizeof(seq_id)); + } + } + } +} + +void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges) const { + const uint32_t v_trans = this->v_trans ? 1 : 0; + const uint32_t n_layer = layers.size(); + + io.write(&v_trans, sizeof(v_trans)); + io.write(&n_layer, sizeof(n_layer)); + + std::vector tmp_buf; + + // Iterate and write all the keys first, each row is a cell + // Get whole range at a time + for (const auto & layer : layers) { + const uint32_t il = layer.il; + + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + + // Write key type + const int32_t k_type_i = (int32_t)layer.k->type; + io.write(&k_type_i, sizeof(k_type_i)); + + // Write row size of key + const uint64_t k_size_row = ggml_row_size(layer.k->type, n_embd_k_gqa); + io.write(&k_size_row, sizeof(k_size_row)); + + // Read each range of cells of k_size length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t buf_size = range_size * k_size_row; + io.write_tensor(layer.k, range.first * k_size_row, buf_size); + } + } + + if (!v_trans) { + for (const auto & layer : layers) { + const uint32_t il = layer.il; + + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Write value type + const int32_t v_type_i = (int32_t)layer.v->type; + io.write(&v_type_i, sizeof(v_type_i)); + + // Write row size of value + const uint64_t v_size_row = ggml_row_size(layer.v->type, n_embd_v_gqa); + io.write(&v_size_row, sizeof(v_size_row)); + + // Read each range of cells of v_size length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t buf_size = range_size * v_size_row; + io.write_tensor(layer.v, range.first * v_size_row, buf_size); + } + } + } else { + // When v is transposed, we also need the element size and get the element ranges from each row + const uint32_t kv_size = cells.size(); + + for (const auto & layer : layers) { + const uint32_t il = layer.il; + + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Write value type + const int32_t v_type_i = (int32_t)layer.v->type; + io.write(&v_type_i, sizeof(v_type_i)); + + // Write element size + const uint32_t v_size_el = 
ggml_type_size(layer.v->type); + io.write(&v_size_el, sizeof(v_size_el)); + + // Write GQA embedding size + io.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa)); + + // For each row, we get the element values of each cell + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + // Read each range of cells of v_size_el length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t src_offset = (range.first + j * kv_size) * v_size_el; + const size_t buf_size = range_size * v_size_el; + io.write_tensor(layer.v, src_offset, buf_size); + } + } + } + } +} + +bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) { + if (dest_seq_id != -1) { + // single sequence + + seq_rm(dest_seq_id, -1, -1); + + llama_sbatch sbatch; + llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false); + + batch.n_tokens = cell_count; + + for (uint32_t i = 0; i < cell_count; ++i) { + llama_pos pos; + uint32_t n_seq_id; + + io.read_to(&pos, sizeof(pos)); + io.read_to(&n_seq_id, sizeof(n_seq_id)); + + if (n_seq_id != 1) { + LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__); + return false; + } + + // read the sequence id, but directly discard it - we will use dest_seq_id instead + { + llama_seq_id seq_id; + io.read_to(&seq_id, sizeof(seq_id)); + } + + batch.pos[i] = pos; + batch.n_seq_id[i] = n_seq_id; + batch.seq_id[i] = &dest_seq_id; + } + + const auto head_cur = find_slot(batch); + if (head_cur < 0) { + LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); + return false; + } + + apply_ubatch(head_cur, batch); + + // keep the head at the old position because we will read the KV data into it in state_read_data() + head = head_cur; + + // DEBUG CHECK: head_cur should be our first cell, head_cur + cell_count - 1 should be our last cell (verify seq_id and pos values) + // Assume that this is one contiguous block of cells + GGML_ASSERT(head_cur + cell_count <= cells.size()); + GGML_ASSERT(cells.pos_get(head_cur) == batch.pos[0]); + GGML_ASSERT(cells.pos_get(head_cur + cell_count - 1) == batch.pos[cell_count - 1]); + GGML_ASSERT(cells.seq_has(head_cur, dest_seq_id)); + GGML_ASSERT(cells.seq_has(head_cur + cell_count - 1, dest_seq_id)); + } else { + // whole KV cache restore + + if (cell_count > cells.size()) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__); + return false; + } + + clear(); + + for (uint32_t i = 0; i < cell_count; ++i) { + llama_pos pos; + uint32_t n_seq_id; + + io.read_to(&pos, sizeof(pos)); + io.read_to(&n_seq_id, sizeof(n_seq_id)); + + cells.pos_set(i, pos); + + for (uint32_t j = 0; j < n_seq_id; ++j) { + llama_seq_id seq_id; + io.read_to(&seq_id, sizeof(seq_id)); + + if (seq_id < 0 || (uint32_t) seq_id >= n_seq_max) { + LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, n_seq_max); + return false; + } + + cells.seq_add(i, seq_id); + } + } + + head = 0; + } + + return true; +} + +bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell_count) { + uint32_t v_trans; + uint32_t n_layer; + + io.read_to(&v_trans, sizeof(v_trans)); + io.read_to(&n_layer, sizeof(n_layer)); + + if (n_layer != layers.size()) { + LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, (uint32_t) layers.size()); + return false; + } + + if (cell_count > cells.size()) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state 
(%u > %u)\n", __func__, cell_count, cells.size()); + return false; + } + + if (this->v_trans != (bool) v_trans) { + LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__); + return false; + } + + // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block + for (const auto & layer : layers) { + const uint32_t il = layer.il; + + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + + // Read type of key + int32_t k_type_i_ref; + io.read_to(&k_type_i_ref, sizeof(k_type_i_ref)); + const int32_t k_type_i = (int32_t) layer.k->type; + if (k_type_i != k_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il); + return false; + } + + // Read row size of key + uint64_t k_size_row_ref; + io.read_to(&k_size_row_ref, sizeof(k_size_row_ref)); + const size_t k_size_row = ggml_row_size(layer.k->type, n_embd_k_gqa); + if (k_size_row != k_size_row_ref) { + LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il); + return false; + } + + if (cell_count) { + // Read and set the keys for the whole cell range + ggml_backend_tensor_set(layer.k, io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row); + } + } + + if (!this->v_trans) { + for (const auto & layer : layers) { + const uint32_t il = layer.il; + + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Read type of value + int32_t v_type_i_ref; + io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)layer.v->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } + + // Read row size of value + uint64_t v_size_row_ref; + io.read_to(&v_size_row_ref, sizeof(v_size_row_ref)); + const size_t v_size_row = ggml_row_size(layer.v->type, n_embd_v_gqa); + if (v_size_row != v_size_row_ref) { + LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il); + return false; + } + + if (cell_count) { + // Read and set the values for the whole cell range + ggml_backend_tensor_set(layer.v, io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row); + } + } + } else { + // For each layer, read the values for each cell (transposed) + for (const auto & layer : layers) { + const uint32_t il = layer.il; + + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Read type of value + int32_t v_type_i_ref; + io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)layer.v->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } + + // Read element size of value + uint32_t v_size_el_ref; + io.read_to(&v_size_el_ref, sizeof(v_size_el_ref)); + const size_t v_size_el = ggml_type_size(layer.v->type); + if (v_size_el != v_size_el_ref) { + LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il); + return false; + } + + // Read GQA embedding size + uint32_t n_embd_v_gqa_ref; + io.read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref)); + if (n_embd_v_gqa != n_embd_v_gqa_ref) { + LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, 
n_embd_v_gqa_ref, il); + return false; + } + + if (cell_count) { + // For each row in the transposed matrix, read the values for the whole cell range + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + const size_t dst_offset = (head + j * cells.size()) * v_size_el; + ggml_backend_tensor_set(layer.v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); + } + } + } + } + + return true; +} + +// +// llama_kv_cache_unified_state +// + +llama_kv_cache_unified_state::llama_kv_cache_unified_state(llama_memory_status status) : status(status) {} + +llama_kv_cache_unified_state::llama_kv_cache_unified_state( + llama_memory_status status, + llama_kv_cache_unified * kv) : status(status), kv(kv) { + n_kv = kv->get_size(); + head = 0; + } + +llama_kv_cache_unified_state::llama_kv_cache_unified_state( + llama_memory_status status, + llama_kv_cache_unified * kv, + llama_sbatch sbatch, + std::vector heads, + std::vector ubatches) + : status(status), + kv(kv), + sbatch(std::move(sbatch)), + heads(std::move(heads)), + ubatches(std::move(ubatches)) { + } + +llama_kv_cache_unified_state::~llama_kv_cache_unified_state() = default; + +bool llama_kv_cache_unified_state::next() { + assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + + if (++i_next >= ubatches.size()) { + return false; + } + + return true; +} + +bool llama_kv_cache_unified_state::apply() { + assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + + kv->apply_ubatch(heads[i_next], ubatches[i_next]); + + n_kv = kv->get_n_kv(); + head = heads[i_next]; + + return true; +} + +std::vector & llama_kv_cache_unified_state::out_ids() { + assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + + return sbatch.out_ids; +} + +llama_memory_status llama_kv_cache_unified_state::get_status() const { + return status; +} + +const llama_ubatch & llama_kv_cache_unified_state::get_ubatch() const { + assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + + return ubatches[i_next]; +} + +uint32_t llama_kv_cache_unified_state::get_n_kv() const { + return n_kv; +} + +ggml_tensor * llama_kv_cache_unified_state::get_k(ggml_context * ctx, int32_t il) const { + return kv->get_k(ctx, il, n_kv); +} + +ggml_tensor * llama_kv_cache_unified_state::get_v(ggml_context * ctx, int32_t il) const { + return kv->get_v(ctx, il, n_kv); +} + +ggml_tensor * llama_kv_cache_unified_state::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const { + return kv->cpy_k(ctx, k_cur, il, head); +} + +ggml_tensor * llama_kv_cache_unified_state::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const { + return kv->cpy_v(ctx, v_cur, il, head); +} + +void llama_kv_cache_unified_state::set_input_k_shift(ggml_tensor * dst) const { + kv->set_input_k_shift(dst); +} + +void llama_kv_cache_unified_state::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const { + kv->set_input_kq_mask(dst, ubatch, causal_attn); +} + +void llama_kv_cache_unified_state::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const { + kv->set_input_pos_bucket(dst, ubatch); +} + +uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) { + // the FA kernels require padding to avoid extra runtime boundary checks + return cparams.flash_attn ? 
256u : 32u;
+}
diff --git a/src/llama-kv-cache-unified.h b/src/llama-kv-cache-unified.h
new file mode 100644
index 000000000..1f1d44b97
--- /dev/null
+++ b/src/llama-kv-cache-unified.h
@@ -0,0 +1,278 @@
+#pragma once
+
+#include "llama-batch.h"
+#include "llama-graph.h"
+#include "llama-kv-cache.h"
+#include "llama-kv-cells.h"
+
+#include <unordered_map>
+#include <vector>
+
+struct llama_cparams;
+struct llama_hparams;
+struct llama_model;
+struct llama_context;
+
+//
+// llama_kv_cache_unified
+//
+
+class llama_kv_cache_unified : public llama_kv_cache {
+public:
+    static uint32_t get_padding(const llama_cparams & cparams);
+
+    // this callback is used to filter out layers that should not be included in the cache
+    using layer_filter_cb = std::function<bool(int32_t il)>;
+
+    llama_kv_cache_unified(
+            const llama_model & model,
+            layer_filter_cb && filter,
+            ggml_type type_k,
+            ggml_type type_v,
+            bool v_trans,
+            bool offload,
+            uint32_t kv_size,
+            uint32_t n_seq_max,
+            uint32_t n_pad,
+            uint32_t n_swa,
+            llama_swa_type swa_type);
+
+    ~llama_kv_cache_unified() = default;
+
+    //
+    // llama_memory_i
+    //
+
+    void clear() override;
+
+    bool seq_rm  (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
+    void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
+    void seq_keep(llama_seq_id seq_id) override;
+    void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override;
+    void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
+
+    llama_pos seq_pos_min(llama_seq_id seq_id) const override;
+    llama_pos seq_pos_max(llama_seq_id seq_id) const override;
+
+    //
+    // llama_kv_cache
+    //
+
+    llama_memory_state_ptr init_batch(
+            const llama_batch & batch,
+            uint32_t n_ubatch,
+            bool embd_pooled,
+            bool logits_all) override;
+
+    llama_memory_state_ptr init_full() override;
+
+    bool update(llama_context & lctx) override;
+
+    void defrag_sched(float thold) override;
+
+    bool get_can_shift() const override;
+
+    // state write/load
+
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
+    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
+
+    //
+    // llama_kv_cache_unified specific API
+    //
+
+    uint32_t get_size() const;
+
+    //
+    // graph_build API
+    //
+
+    uint32_t get_n_kv() const;
+
+    // get views of the current state of the cache
+    ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv) const;
+    ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv) const;
+
+    // store k_cur and v_cur in the cache based on the provided head location
+    ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il, uint32_t head_cur) const;
+    ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il, uint32_t head_cur) const;
+
+    //
+    // preparation API
+    //
+
+    // find places for the provided ubatches in the cache, returns the head locations
+    // return empty vector on failure
+    std::vector<uint32_t> prepare(const std::vector<llama_ubatch> & ubatches);
+
+    // return the cell position where we can insert the ubatch
+    // return -1 on failure to find a contiguous slot of kv cells
+    int32_t find_slot(const llama_ubatch & ubatch) const;
+
+    // emplace the ubatch context into slot: [head_cur, head_cur + ubatch.n_tokens)
+    void apply_ubatch(uint32_t head_cur, const llama_ubatch & ubatch);
+
+    //
+    // set_input API
+    //
+
+    void set_input_kq_mask   (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
+    void set_input_k_shift   (ggml_tensor * dst) const;
+    void 
set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;
+
+private:
+    const llama_model & model;
+    const llama_hparams & hparams;
+
+    struct kv_layer {
+        // layer index in the model
+        // note: can be different from the layer index in the KV cache
+        uint32_t il;
+
+        ggml_tensor * k;
+        ggml_tensor * v;
+    };
+
+    bool do_defrag = false;
+    bool v_trans   = true; // the value tensor is transposed
+
+    // the current index from where we start searching for a free slot in the ring buffer of KV cells (see find_slot())
+    // note: this is not part of the KV state and it's only used to speed-up the find_slot() method
+    uint32_t head = 0;
+
+    const uint32_t n_seq_max = 1;
+
+    // required padding
+    const uint32_t n_pad = 1;
+
+    // SWA
+    const uint32_t n_swa = 0;
+
+    const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
+
+    std::vector<ggml_context_ptr>        ctxs;
+    std::vector<ggml_backend_buffer_ptr> bufs;
+
+    llama_kv_cells_unified cells;
+
+    std::vector<kv_layer> layers;
+
+    // model layer id -> KV cache layer id
+    std::unordered_map<int32_t, int32_t> map_layer_ids;
+
+    // defrag
+    struct {
+        std::vector<uint32_t> ids;
+    } defrag_info;
+
+    // return true if cells have been moved
+    bool defrag_prepare(int32_t n_max_nodes);
+
+    size_t total_size() const;
+
+    size_t size_k_bytes() const;
+    size_t size_v_bytes() const;
+
+    bool is_masked_swa(llama_pos p0, llama_pos p1) const;
+
+    ggml_tensor * build_rope_shift(
+            const llama_cparams & cparams,
+            ggml_context * ctx,
+            ggml_tensor * cur,
+            ggml_tensor * shift,
+            ggml_tensor * factors,
+            float freq_base,
+            float freq_scale) const;
+
+    llm_graph_result_ptr build_graph_shift(
+            const llama_cparams & cparams,
+            ggml_context * ctx,
+            ggml_cgraph * gf) const;
+
+    llm_graph_result_ptr build_graph_defrag(
+            const llama_cparams & cparams,
+            ggml_context * ctx,
+            ggml_cgraph * gf) const;
+
+    void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
+    void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
+
+    bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
+    bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
+};
+
+class llama_kv_cache_unified_state : public llama_memory_state_i {
+public:
+    // used for errors
+    llama_kv_cache_unified_state(llama_memory_status status);
+
+    // used to create a full-cache state
+    llama_kv_cache_unified_state(
+            llama_memory_status status,
+            llama_kv_cache_unified * kv);
+
+    // used to create a state from a batch
+    llama_kv_cache_unified_state(
+            llama_memory_status status,
+            llama_kv_cache_unified * kv,
+            llama_sbatch sbatch,
+            std::vector<uint32_t> heads,
+            std::vector<llama_ubatch> ubatches);
+
+    virtual ~llama_kv_cache_unified_state();
+
+    //
+    // llama_memory_state_i
+    //
+
+    bool next()  override;
+    bool apply() override;
+
+    std::vector & out_ids() override;
+
+    llama_memory_status  get_status() const override;
+    const llama_ubatch & get_ubatch() const override;
+
+    //
+    // llama_kv_cache_unified_state specific API
+    //
+
+    uint32_t get_n_kv() const;
+
+    // get views of the current state of the cache
+    ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
+    ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
+
+    // store k_cur and v_cur in the cache based on the provided head location
+    ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const;
+    ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const;
+
+    void set_input_k_shift(ggml_tensor * dst) const;
+
+    void set_input_kq_mask   (ggml_tensor * dst, const llama_ubatch * 
ubatch, bool causal_attn) const; + void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const; + +private: + const llama_memory_status status; + + llama_kv_cache_unified * kv; + + llama_sbatch sbatch; + + // the index of the next ubatch to process + size_t i_next = 0; + + std::vector heads; + std::vector ubatches; + + // + // data needed for building the compute graph for the current ubatch: + // + + // a heuristic, to avoid attending the full cache if it is not yet utilized + // as the cache gets filled, the benefit from this heuristic disappears + int32_t n_kv; + + // the beginning of the current slot in which the ubatch will be inserted + int32_t head; +}; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index a2624d715..aefd23e32 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -1,2827 +1 @@ #include "llama-kv-cache.h" - -#include "llama-impl.h" -#include "llama-batch.h" -#include "llama-cparams.h" -#include "llama-model.h" -#include "llama-context.h" - -#include -#include -#include -#include -#include -#include - -// -// llama_kv_cache_unified -// - -uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) { - // the FA kernels require padding to avoid extra runtime boundary checks - return cparams.flash_attn ? 256u : 32u; -} - -llama_kv_cache_unified::llama_kv_cache_unified( - const llama_model & model, - layer_filter_cb && filter, - ggml_type type_k, - ggml_type type_v, - bool v_trans, - bool offload, - uint32_t kv_size, - uint32_t n_seq_max, - uint32_t n_pad, - uint32_t n_swa, - llama_swa_type swa_type) : - model(model), hparams(model.hparams), v_trans(v_trans), - n_seq_max(n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) { - - GGML_ASSERT(kv_size % n_pad == 0); - - // create a context for each buffer type - std::map ctx_map; - auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { - auto it = ctx_map.find(buft); - if (it == ctx_map.end()) { - ggml_init_params params = { - /*.mem_size =*/ size_t(2u*hparams.n_layer*ggml_tensor_overhead()), - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, - }; - - ggml_context * ctx = ggml_init(params); - if (!ctx) { - return nullptr; - } - - ctx_map[buft] = ctx; - ctxs.emplace_back(ctx); - - return ctx; - } - - return it->second; - }; - - head = 0; - size = kv_size; - used = 0; - - cells.resize(kv_size); - - for (uint32_t il = 0; il < hparams.n_layer; il++) { - if (filter && !filter(il)) { - LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il); - continue; - } - - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - const char * dev_name = "CPU"; - - ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type(); - - if (offload) { - auto * dev = model.dev_layer(il); - buft = ggml_backend_dev_buffer_type(dev); - - dev_name = ggml_backend_dev_name(dev); - } - - LLAMA_LOG_DEBUG("%s: layer %3d: dev = %s\n", __func__, il, dev_name); - - ggml_context * ctx = ctx_for_buft(buft); - if (!ctx) { - throw std::runtime_error("failed to create ggml context for kv cache"); - } - - ggml_tensor * k; - ggml_tensor * v; - - k = ggml_new_tensor_2d(ctx, type_k, n_embd_k_gqa, kv_size); - v = ggml_new_tensor_2d(ctx, type_v, n_embd_v_gqa, kv_size); - - ggml_format_name(k, "cache_k_l%d", il); - ggml_format_name(v, "cache_v_l%d", il); - - map_layer_ids[il] = layers.size(); - layers.push_back({ il, k, v }); - } - - // allocate tensors and initialize the 
buffers to avoid NaNs in the padding - for (auto it : ctx_map) { - auto * buft = it.first; - auto * ctx = it.second; - - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); - if (!buf) { - throw std::runtime_error("failed to allocate buffer for kv cache"); - } - - LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); - - ggml_backend_buffer_clear(buf, 0); - bufs.emplace_back(buf); - } - - { - const size_t memory_size_k = size_k_bytes(); - const size_t memory_size_v = size_v_bytes(); - - LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, - (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max, - ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), - ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); - } -} - -void llama_kv_cache_unified::clear() { - for (uint32_t i = 0; i < size; ++i) { - cells[i].pos = -1; - cells[i].seq_id.clear(); - } - - head = 0; - used = 0; - - for (auto & buf : bufs) { - ggml_backend_buffer_clear(buf.get(), 0); - } -} - -bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { - uint32_t new_head = size; - - if (p0 < 0) { - p0 = 0; - } - - if (p1 < 0) { - p1 = std::numeric_limits::max(); - } - - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].pos >= p0 && cells[i].pos < p1) { - if (seq_id < 0) { - cells[i].seq_id.clear(); - } else if (cells[i].has_seq_id(seq_id)) { - cells[i].seq_id.erase(seq_id); - } else { - continue; - } - - if (cells[i].is_empty()) { - // keep count of the number of used cells - if (cells[i].pos >= 0) { - used--; - } - - cells[i].pos = -1; - - if (new_head == size) { - new_head = i; - } - } - } - } - - // If we freed up a slot, set head to it so searching can start there. - if (new_head != size && new_head < head) { - head = new_head; - } - - return true; -} - -void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { - if (seq_id_src == seq_id_dst) { - return; - } - - if (p0 < 0) { - p0 = 0; - } - - if (p1 < 0) { - p1 = std::numeric_limits::max(); - } - - // otherwise, this is the KV of a Transformer-like model - head = 0; - - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].has_seq_id(seq_id_src) && cells[i].pos >= p0 && cells[i].pos < p1) { - cells[i].seq_id.insert(seq_id_dst); - } - } -} - -void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) { - uint32_t new_head = size; - - for (uint32_t i = 0; i < size; ++i) { - if (!cells[i].has_seq_id(seq_id)) { - if (cells[i].pos >= 0) { - used--; - } - - cells[i].pos = -1; - cells[i].seq_id.clear(); - - if (new_head == size){ - new_head = i; - } - } else { - cells[i].seq_id.clear(); - cells[i].seq_id.insert(seq_id); - } - } - - // If we freed up a slot, set head to it so searching can start there. 
- if (new_head != size && new_head < head) { - head = new_head; - } -} - -void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { - if (delta == 0) { - return; - } - - uint32_t new_head = size; - - if (p0 < 0) { - p0 = 0; - } - - if (p1 < 0) { - p1 = std::numeric_limits::max(); - } - - // If there is no range then return early to avoid looping over the - if (p0 == p1) { - return; - } - - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { - has_shift = true; - - cells[i].pos += delta; - cells[i].delta += delta; - - if (cells[i].pos < 0) { - if (!cells[i].is_empty()) { - used--; - } - cells[i].pos = -1; - cells[i].seq_id.clear(); - if (new_head == size) { - new_head = i; - } - } - } - } - - // If we freed up a slot, set head to it so searching can start there. - // Otherwise we just start the next search from the beginning. - head = new_head != size ? new_head : 0; -} - -void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { - if (d == 1) { - return; - } - - if (p0 < 0) { - p0 = 0; - } - - if (p1 < 0) { - p1 = std::numeric_limits::max(); - } - - // If there is no range then return early to avoid looping over the cache. - if (p0 == p1) { - return; - } - - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { - has_shift = true; - - { - llama_pos p_old = cells[i].pos; - cells[i].pos /= d; - cells[i].delta += cells[i].pos - p_old; - } - } - } -} - -llama_pos llama_kv_cache_unified::seq_pos_min(llama_seq_id seq_id) const { - llama_pos result = std::numeric_limits::max(); - - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].has_seq_id(seq_id)) { - result = std::min(result, cells[i].pos); - } - } - - if (result == std::numeric_limits::max()) { - result = -1; - } - - return result; -} - -llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const { - llama_pos result = -1; - - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].has_seq_id(seq_id)) { - result = std::max(result, cells[i].pos); - } - } - - return result; -} - -void llama_kv_cache_unified::restore() { - for (const auto & [id, cell] : recovery.cells) { - // TODO: move to new `struct kv_cells` - const bool is_empty0 = cells[id].is_empty(); - const bool is_empty1 = cell.is_empty(); - - if (!is_empty0 && is_empty1) { - used--; - } else if (is_empty0 && !is_empty1) { - used++; - } - - cells[id] = cell; - } - - recovery.clear(); -} - -void llama_kv_cache_unified::commit() { - if (recovery.cells.empty()) { - LLAMA_LOG_WARN("%s: the recovery information upon a commit was empty - might indicate a bug (ref: %s)\n", - __func__, "https://github.com/ggml-org/llama.cpp/pull/13194"); - return; - } - - recovery.clear(); -} - -bool llama_kv_cache_unified::update(llama_context & lctx) { - bool need_reserve = false; - - auto * sched = lctx.get_sched(); - - if (has_shift) { - if (!get_can_shift()) { - GGML_ABORT("The current KV cache / model configuration does not support K-shift"); - } - - LLAMA_LOG_DEBUG("%s: applying K-shift\n", __func__); - - // apply K-shift if needed - if (hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { - ggml_backend_sched_reset(sched); - - auto * gf = lctx.graph_init(); - - auto res = build_graph_shift(lctx.get_cparams(), lctx.get_ctx_compute(), gf); - - ggml_backend_sched_alloc_graph(sched, gf); - - res->set_inputs(nullptr); - - lctx.graph_compute(gf, false); - - need_reserve = true; - } - - { - 
has_shift = false; - - for (uint32_t i = 0; i < size; ++i) { - cells[i].delta = 0; - } - } - } - - if (do_defrag) { - LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__); - - if (defrag_prepare(lctx.graph_max_nodes())) { - ggml_backend_sched_reset(sched); - - auto * gf = lctx.graph_init(); - - auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf); - - ggml_backend_sched_alloc_graph(sched, gf); - - res->set_inputs(nullptr); - - lctx.graph_compute(gf, false); - - need_reserve = true; - } - - do_defrag = false; - } - - return need_reserve; -} - -void llama_kv_cache_unified::defrag_sched(float thold) { - // - do not defrag small contexts (i.e. < 2048 tokens) - // - count the padding towards the number of used tokens - const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - (float(used + n_pad)/n)) : 0.0f; - - // queue defragmentation for next llama_kv_cache_update - if (fragmentation > thold) { - LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation); - - do_defrag = true; - } -} - -void llama_kv_cache_unified::set_full() { - n = size; - - // when simulating a full KV cache, the specific value of the "head" pointer is not important because it does not - // affect the shapes of the tensors in the compute graph - it only affects the offsets of the K/V views. - // we should only guarantee that the head position won't cause out-of-bounds view of the K, V tensors, so - // setting it to 0 is the simplest way to achieve that - // ref: https://github.com/ggml-org/llama.cpp/issues/13359 - head = 0; -} - -llama_sbatch llama_kv_cache_unified::sbatch_init(const llama_batch & batch, bool logits_all) { - return llama_sbatch(batch, hparams.n_embd, true, logits_all); -} - -llama_ubatch llama_kv_cache_unified::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const { - GGML_UNUSED(embd_pooled); - return sbatch.split_simple(n_ubatch); -} - -bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) { - const uint32_t n_tokens = ubatch.n_tokens; - - // if we have enough unused cells before the current head -> - // better to start searching from the beginning of the cache, hoping to fill it - if (head > used + 2*ubatch.n_tokens) { - head = 0; - } - - // otherwise, one cell per token. 
- - if (n_tokens > size) { - LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %d\n", __func__, n_tokens, size); - return false; - } - -//#define FIND_SLOT_DEBUG 1 -#if FIND_SLOT_DEBUG - LLAMA_LOG_WARN("begin: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", n, used, head, n_swa); - - // for debugging - { - std::string ss; - if (n_swa > 0) { - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].pos == -1) { - ss += '.'; - } else { - ss += std::to_string(*cells[i].seq_id.begin()); - } - if (i%256 == 255) { - ss += '\n'; - } - } - } - LLAMA_LOG_WARN("\n%s\n", ss.c_str()); - } -#endif - - uint32_t n_tested = 0; - - while (true) { - if (head + n_tokens > size) { - n_tested += size - head; - head = 0; - continue; - } - - bool found = true; - for (uint32_t i = 0; i < n_tokens; i++) { - if (cells[head + i].pos >= 0) { - found = false; - head += i + 1; - n_tested += i + 1; - break; - } - } - - if (found) { - break; - } - - if (n_tested >= size) { - //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens); - return false; - } - } - - for (uint32_t i = 0; i < n_tokens; ++i) { - // remember the original state - if (recovery.cells.find(head + i) == recovery.cells.end()) { - recovery.cells[head + i] = cells[head + i]; - } - - cells[head + i].pos = ubatch.pos[i]; - - for (int32_t j = 0; j < ubatch.n_seq_id[i]; j++) { - cells[head + i].seq_id.insert(ubatch.seq_id[i][j]); - } - } - - used += n_tokens; - - // a heuristic, to avoid attending the full cache if it is not yet utilized - // after enough generations, the benefit from this heuristic disappears - // if we start defragmenting the cache, the benefit from this will be more important - n = std::min(size, std::max(n_pad, GGML_PAD(cell_max(), n_pad))); - -#ifdef FIND_SLOT_DEBUG - LLAMA_LOG_WARN("end: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", n, used, head, n_swa); -#endif - - return true; -} - -bool llama_kv_cache_unified::get_can_shift() const { - return true; -} - -uint32_t llama_kv_cache_unified::get_n() const { - return n; -} - -uint32_t llama_kv_cache_unified::get_size() const { - return size; -} - -ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il) const { - const int32_t ikv = map_layer_ids.at(il); - - auto * k = layers[ikv].k; - - return ggml_view_3d(ctx, k, - hparams.n_embd_head_k, hparams.n_head_kv(il), n, - ggml_row_size(k->type, hparams.n_embd_head_k), - ggml_row_size(k->type, hparams.n_embd_k_gqa(il)), - 0); -} - -ggml_tensor * llama_kv_cache_unified::get_v(ggml_context * ctx, int32_t il) const { - const int32_t ikv = map_layer_ids.at(il); - - auto * v = layers[ikv].v; - - if (!v_trans) { - // note: v->nb[1] <= v->nb[2] - return ggml_view_3d(ctx, v, - hparams.n_embd_head_v, hparams.n_head_kv(il), n, - ggml_row_size(v->type, hparams.n_embd_head_v), // v->nb[1] - ggml_row_size(v->type, hparams.n_embd_v_gqa(il)), // v->nb[2] - 0); - } - - // note: v->nb[1] > v->nb[2] - return ggml_view_3d(ctx, v, - n, hparams.n_head_kv(il), hparams.n_embd_head_v, - ggml_row_size(v->type, v->ne[1]*hparams.n_embd_head_v), // v->nb[1] - ggml_row_size(v->type, v->ne[1]), // v->nb[2] - 0); -} - -ggml_tensor * llama_kv_cache_unified::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const { - const int32_t ikv = map_layer_ids.at(il); - - auto * k = layers[ikv].k; - - const int64_t n_tokens = k_cur->ne[2]; - - ggml_tensor * k_view = ggml_view_1d(ctx, k, - n_tokens*hparams.n_embd_k_gqa(il), - ggml_row_size(k->type, hparams.n_embd_k_gqa(il))*head); - - return ggml_cpy(ctx, k_cur, k_view); -} - -ggml_tensor * 
llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const { - const int32_t ikv = map_layer_ids.at(il); - - auto * v = layers[ikv].v; - - const int64_t n_tokens = v_cur->ne[2]; - - v_cur = ggml_reshape_2d(ctx, v_cur, hparams.n_embd_v_gqa(il), n_tokens); - - ggml_tensor * v_view = nullptr; - - if (!v_trans) { - v_view = ggml_view_1d(ctx, v, - n_tokens*hparams.n_embd_v_gqa(il), - ggml_row_size(v->type, hparams.n_embd_v_gqa(il))*head); - } else { - // note: the V cache is transposed when not using flash attention - v_view = ggml_view_2d(ctx, v, n_tokens, hparams.n_embd_v_gqa(il), - (v->ne[1])*ggml_element_size(v), - ( head)*ggml_element_size(v)); - - v_cur = ggml_transpose(ctx, v_cur); - } - - return ggml_cpy(ctx, v_cur, v_view); -} - -void llama_kv_cache_unified::prune_swa(llama_seq_id seq_id, llama_pos pmin, llama_pos pmax) { - // no pruning is needed when the cache does not use SWA - GGML_ASSERT(swa_type != LLAMA_SWA_TYPE_NONE && "do not prune non-SWA cache"); - - int n_attended = 0; - - for (uint32_t i = 0; i < size; ++i) { - const llama_pos p0 = cells[i].pos; - - if (p0 <= pmin && !is_masked_swa(p0, pmin)) { - n_attended++; - } - - if (is_masked_swa(p0, pmax)) { - if (seq_id < 0) { - cells[i].seq_id.clear(); - } else if (cells[i].has_seq_id(seq_id)) { - cells[i].seq_id.erase(seq_id); - } else { - continue; - } - - if (cells[i].is_empty()) { - // keep count of the number of used cells - if (cells[i].pos >= 0) { - used--; - } - - cells[i].pos = -1; - } - } - } - - if (n_attended < std::min(n_swa, pmin)) { - LLAMA_LOG_WARN("%s: partial SWA cache detected - possible loss of information, pmin = %d, n_attended = %d, n_swa = %d\n", __func__, pmin, n_attended, n_swa); - } -} - -void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const { - const int64_t n_tokens = ubatch->n_tokens; - const int64_t n_seq_tokens = ubatch->n_seq_tokens; - const int64_t n_seqs = ubatch->n_seqs; - - GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); - float * data = (float *) dst->data; - - const int64_t n_kv = n; - - // Use only the previous KV cells of the correct sequence for each token of the ubatch. - // It's assumed that if a token in the batch has multiple sequences, they are equivalent. 
- // Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch: - // Causal mask: - // xxx------- - // xxxx------ - // xxxxx----- - // Non-causal mask: - // xxxxx----- - // xxxxx----- - // xxxxx----- - // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615 - for (int h = 0; h < 1; ++h) { - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch->seq_id[s][0]; - - for (int j = 0; j < n_seq_tokens; ++j) { - const llama_pos p1 = ubatch->pos[s*n_seq_tokens + j]; - - for (int i = 0; i < n_kv; ++i) { - const llama_pos p0 = cells[i].pos; - - bool masked = false; - - // mask the token if not the same sequence - masked = masked || (!cells[i].has_seq_id(seq_id)); - - // mask future tokens - masked = masked || (causal_attn && p0 > p1); - - // apply SWA if any - masked = masked || (is_masked_swa(p0, p1)); - - float f = 0.0f; - - if (masked) { - f = -INFINITY; - } else if (hparams.use_alibi) { - f = -std::abs(p0 - p1); - } - - data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f; - } - } - } - - // mask padded tokens - if (data) { - for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) { - for (int j = 0; j < n_kv; ++j) { - data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY; - } - } - } - } -} - -void llama_kv_cache_unified::set_input_k_shift(ggml_tensor * dst) const { - GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); - - int32_t * data = (int32_t *) dst->data; - - for (uint32_t i = 0; i < size; ++i) { - data[i] = cells[i].delta; - } -} - -void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const { - const int64_t n_tokens = ubatch->n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); - GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing - - int32_t * data = (int32_t *) dst->data; - - const int64_t n_kv = n; - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - for (int i = 0; i < n_kv; ++i) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(cells[i].pos, ubatch->pos[j], hparams.n_rel_attn_bkts, false); - } - } - } -} - -size_t llama_kv_cache_unified::total_size() const { - size_t size = 0; - - for (const auto & buf : bufs) { - size += ggml_backend_buffer_get_size(buf.get()); - } - - return size; -} - -size_t llama_kv_cache_unified::size_k_bytes() const { - size_t size_k_bytes = 0; - - for (const auto & layer : layers) { - size_k_bytes += ggml_nbytes(layer.k); - } - - return size_k_bytes; -} - -size_t llama_kv_cache_unified::size_v_bytes() const { - size_t size_v_bytes = 0; - - for (const auto & layer : layers) { - size_v_bytes += ggml_nbytes(layer.v); - } - - return size_v_bytes; -} - -ggml_tensor * llama_kv_cache_unified::build_rope_shift( - const llama_cparams & cparams, - ggml_context * ctx, - ggml_tensor * cur, - ggml_tensor * shift, - ggml_tensor * factors, - float freq_base, - float freq_scale) const { - const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; - - const auto & yarn_ext_factor = cparams.yarn_ext_factor; - const auto & yarn_beta_fast = cparams.yarn_beta_fast; - const auto & yarn_beta_slow = cparams.yarn_beta_slow; - - const auto & n_rot = hparams.n_rot; - const auto & rope_type = hparams.rope_type; - - // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly. - // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation. - const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2 ? 
1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor; - - ggml_tensor * tmp; - - if (ggml_is_quantized(cur->type)) { - // dequantize to f32 -> RoPE -> quantize back - tmp = ggml_cast(ctx, cur, GGML_TYPE_F32); - - tmp = ggml_rope_ext(ctx, tmp, - shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); - - tmp = ggml_cpy(ctx, tmp, cur); - } else { - // we rotate only the first n_rot dimensions - tmp = ggml_rope_ext_inplace(ctx, cur, - shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); - } - - return tmp; -} - -class llm_graph_input_k_shift : public llm_graph_input_i { -public: - llm_graph_input_k_shift(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {} - virtual ~llm_graph_input_k_shift() = default; - - void set_input(const llama_ubatch * ubatch) override; - - ggml_tensor * k_shift; // I32 [kv_size] - - const llama_kv_cache_unified * kv_self; -}; - -void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) { - GGML_UNUSED(ubatch); - - if (k_shift) { - kv_self->set_input_k_shift(k_shift); - } -} - -llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift( - const llama_cparams & cparams, - ggml_context * ctx, - ggml_cgraph * gf) const { - auto res = std::make_unique(); - - const auto & n_embd_head_k = hparams.n_embd_head_k; - //const auto & n_embd_head_v = hparams.n_embd_head_v; - - //GGML_ASSERT(kv_self->size == n_ctx); - - auto inp = std::make_unique(this); - - inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cparams.n_ctx); - ggml_set_input(inp->k_shift); - - for (const auto & layer : layers) { - const uint32_t il = layer.il; - - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - - const float freq_base_l = model.get_rope_freq_base (cparams, il); - const float freq_scale_l = model.get_rope_freq_scale(cparams, il); - - ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); - - ggml_tensor * k = - ggml_view_3d(ctx, layer.k, - n_embd_head_k, n_head_kv, size, - ggml_row_size(layer.k->type, n_embd_head_k), - ggml_row_size(layer.k->type, n_embd_k_gqa), - 0); - - ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l); - - ggml_build_forward_expand(gf, cur); - } - - res->add_input(std::move(inp)); - - return res; -} - -llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag( - const llama_cparams & cparams, - ggml_context * ctx, - ggml_cgraph * gf) const { - auto res = std::make_unique(); - - const auto & ids = defrag_info.ids; - -#if 0 - // CPU defrag - // - // TODO: optimizations are possible: - // - multiple threads - // - avoid copying to the host memory when already there - // - // likely not worth the effort, as we have ggml_graph based defrag - // - - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - - const uint32_t kv_size = size; - - std::vector buf_k; - std::vector buf_v; - - for (uint32_t il = 0; il < n_layer; ++il) { - const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); - const size_t k_size = ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size); - - const size_t v_size_el = ggml_type_size(v_l[il]->type); - const size_t v_size = ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size); - - buf_k.resize(k_size); - buf_v.resize(v_size); - - 
ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size()); - ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size()); - - // batch move [i, i+nm) to [id, id+nm) - // note: cells can move only to a lower index - for (uint32_t i = 0; i < n_kv; ++i) { - const uint32_t id = ids[i]; - - if (i == id || id == n_kv) { - continue; - } - - uint32_t nm = 1; - - while (i + nm < n_kv && ids[i + nm] == id + nm) { - nm++; - } - - // move keys - { - const int64_t os = i*k_size_row; - const int64_t od = id*k_size_row; - - memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row); - } - - // move values (note: they are transposed) - { - const int64_t os = i; - const int64_t od = id; - - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); - } - } - - i += nm - 1; - } - - ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size()); - ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size()); - } -#else - for (uint32_t i = 0; i < ids.size(); ++i) { - const uint32_t id = ids[i]; - - if (i == id || id == ids.size()) { - continue; - } - - uint32_t nm = 1; - - while (i + nm < ids.size() && ids[i + nm] == id + nm) { - nm++; - } - - for (const auto & layer : layers) { - const uint32_t il = layer.il; - - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); - - ggml_tensor * view_k_src = ggml_view_2d(ctx, layer.k, - n_embd_k_gqa, nm, - ggml_row_size(layer.k->type, n_embd_k_gqa), - ggml_row_size(layer.k->type, n_embd_k_gqa*i)); - - ggml_tensor * view_k_dst = ggml_view_2d(ctx, layer.k, - n_embd_k_gqa, nm, - ggml_row_size(layer.k->type, n_embd_k_gqa), - ggml_row_size(layer.k->type, n_embd_k_gqa*id)); - - ggml_tensor * view_v_src; - ggml_tensor * view_v_dst; - - if (cparams.flash_attn) { - // NOTE: the V cache is not transposed when using flash attention - view_v_src = ggml_view_2d(ctx, layer.v, - n_embd_v_gqa, nm, - ggml_row_size(layer.v->type, n_embd_v_gqa), - ggml_row_size(layer.v->type, n_embd_v_gqa*i)); - - view_v_dst = ggml_view_2d(ctx, layer.v, - n_embd_v_gqa, nm, - ggml_row_size(layer.v->type, n_embd_v_gqa), - ggml_row_size(layer.v->type, n_embd_v_gqa*id)); - } else { - view_v_src = ggml_view_2d(ctx, layer.v, - nm, n_embd_v_gqa, - ggml_row_size(layer.v->type, size), - ggml_row_size(layer.v->type, i)); - - view_v_dst = ggml_view_2d(ctx, layer.v, - nm, n_embd_v_gqa, - ggml_row_size(layer.v->type, size), - ggml_row_size(layer.v->type, id)); - } - - ggml_build_forward_expand(gf, ggml_cpy(ctx, view_k_src, view_k_dst)); - ggml_build_forward_expand(gf, ggml_cpy(ctx, view_v_src, view_v_dst)); - } - - i += nm - 1; - } - - //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes); -#endif - - return res; -} - -bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { - const uint32_t n_layer = layers.size(); - - const uint32_t n_kv = cell_max(); - const uint32_t n_used = used; - - assert(n_used <= n_kv); - - //const int64_t t_start = ggml_time_us(); - - // number of cells moved - uint32_t n_moves = 0; - - // each move requires 6*n_layer tensors (see graph_build_kv_self_defrag) - // - source view, destination view, copy operation - // - x2 for keys and values - //const uint32_t max_moves = max_nodes()/(6*n_layer); - // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 - const uint32_t max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer); - - // determine which KV cells to move where - // - // cell i 
moves to ids[i] - // - // if ids[i] == i || ids[i] == n_kv, then cell i is not moved - // - auto & ids = defrag_info.ids; - - ids.clear(); - ids.resize(n_kv, n_kv); - - for (uint32_t i0 = 0; i0 < n_used; ++i0) { - const auto & cell0 = cells[i0]; - - if (!cell0.is_empty()) { - ids[i0] = i0; - - continue; - } - - // found a hole - fill it with data from the end of the cache - - uint32_t nh = 1; - - // determine the size of the hole - while (i0 + nh < n_used && cells[i0 + nh].is_empty()) { - nh++; - } - - uint32_t nf = 0; - uint32_t is = n_kv - 1; - - // starting from the end, find nh non-empty cells - for (; is > i0; --is) { - const auto & cell1 = cells[is]; - - if (cell1.is_empty() || ids[is] != n_kv) { - continue; - } - - // non-empty cell which is not yet moved - nf++; - - if (nf == nh) { - break; - } - } - - // this can only happen if `n_used` is not accurate, which would be a bug - GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); - - nf = 0; - - uint32_t i1 = is; - - // are we moving a continuous block of memory? - bool cont = false; - - // should we stop searching for the next move? - bool stop = false; - - // go back and move the nf cells to the hole - for (; i1 < n_kv; ++i1) { - auto & cell1 = cells[i1]; - - if (cell1.is_empty() || ids[i1] != n_kv) { - if (n_moves == max_moves) { - stop = true; - break; - } - - cont = false; - continue; - } - - // this cell goes to (i0 + nf) - ids[i1] = i0 + nf; - - // move the cell meta data - cells[i0 + nf] = cell1; - - // clear the old cell and move the head there - cell1 = kv_cell(); - head = n_used; - - if (!cont) { - n_moves++; - cont = true; - } - - nf++; - - if (nf == nh) { - break; - } - } - - if (stop || n_moves == max_moves) { - break; - } - - //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh); - - i0 += nh - 1; - } - - if (n_moves == 0) { - return false; - } - - LLAMA_LOG_DEBUG("%s: (tmp log) KV defrag cell moves: %u\n", __func__, n_moves); - - LLAMA_LOG_DEBUG("%s: expected gf nodes: %u\n", __func__, 6*n_moves*n_layer); - - return true; -} - -uint32_t llama_kv_cache_unified::cell_max() const { - for (uint32_t i = size; i > 0; --i) { - const kv_cell & cell = cells[i - 1]; - - if (cell.pos >= 0 && !cell.is_empty()) { - return i; - } - } - - return 0; -} - -bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const { - if (p0 < 0) { - return true; - } - - switch (swa_type) { - case LLAMA_SWA_TYPE_NONE: - { - } break; - case LLAMA_SWA_TYPE_STANDARD: - { - if (p1 - p0 >= (int32_t) n_swa) { - return true; - } - } break; - case LLAMA_SWA_TYPE_CHUNKED: - { - const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa; - - if (p0 < pos_chunk_start) { - return true; - } - } break; - } - - return false; -} - -void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const { - std::vector> cell_ranges; // ranges, from inclusive, to exclusive - uint32_t cell_count = 0; - - // Count the number of cells with the specified seq_id - // Find all the ranges of cells with this seq id (or all, when -1) - uint32_t cell_range_begin = size; - for (uint32_t i = 0; i < size; ++i) { - const auto & cell = cells[i]; - if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) { - ++cell_count; - if (cell_range_begin == size) { - cell_range_begin = i; - } - } else { - if (cell_range_begin != size) { - cell_ranges.emplace_back(cell_range_begin, i); - cell_range_begin = size; - } - } - } - if (cell_range_begin != size) { - cell_ranges.emplace_back(cell_range_begin, size); - } - 
- // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count - uint32_t cell_count_check = 0; - for (const auto & range : cell_ranges) { - cell_count_check += range.second - range.first; - } - GGML_ASSERT(cell_count == cell_count_check); - - io.write(&cell_count, sizeof(cell_count)); - - state_write_meta(io, cell_ranges, seq_id); - state_write_data(io, cell_ranges); -} - -void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id) { - uint32_t cell_count; - io.read_to(&cell_count, sizeof(cell_count)); - - bool res = true; - res = res && state_read_meta(io, cell_count, seq_id); - res = res && state_read_data(io, cell_count); - - if (!res) { - if (seq_id == -1) { - clear(); - } else { - seq_rm(seq_id, -1, -1); - } - throw std::runtime_error("failed to restore kv cache"); - } -} - -void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id) const { - for (const auto & range : cell_ranges) { - for (uint32_t i = range.first; i < range.second; ++i) { - const auto & cell = cells[i]; - const llama_pos pos = cell.pos; - const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0; - - io.write(&pos, sizeof(pos)); - io.write(&n_seq_id, sizeof(n_seq_id)); - - if (n_seq_id) { - for (auto seq_id : cell.seq_id) { - io.write(&seq_id, sizeof(seq_id)); - } - } - } - } -} - -void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges) const { - const uint32_t v_trans = this->v_trans ? 1 : 0; - const uint32_t n_layer = layers.size(); - - io.write(&v_trans, sizeof(v_trans)); - io.write(&n_layer, sizeof(n_layer)); - - std::vector tmp_buf; - - // Iterate and write all the keys first, each row is a cell - // Get whole range at a time - for (const auto & layer : layers) { - const uint32_t il = layer.il; - - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); - - // Write key type - const int32_t k_type_i = (int32_t)layer.k->type; - io.write(&k_type_i, sizeof(k_type_i)); - - // Write row size of key - const uint64_t k_size_row = ggml_row_size(layer.k->type, n_embd_k_gqa); - io.write(&k_size_row, sizeof(k_size_row)); - - // Read each range of cells of k_size length each into tmp_buf and write out - for (const auto & range : cell_ranges) { - const size_t range_size = range.second - range.first; - const size_t buf_size = range_size * k_size_row; - io.write_tensor(layer.k, range.first * k_size_row, buf_size); - } - } - - if (!v_trans) { - for (const auto & layer : layers) { - const uint32_t il = layer.il; - - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Write value type - const int32_t v_type_i = (int32_t)layer.v->type; - io.write(&v_type_i, sizeof(v_type_i)); - - // Write row size of value - const uint64_t v_size_row = ggml_row_size(layer.v->type, n_embd_v_gqa); - io.write(&v_size_row, sizeof(v_size_row)); - - // Read each range of cells of v_size length each into tmp_buf and write out - for (const auto & range : cell_ranges) { - const size_t range_size = range.second - range.first; - const size_t buf_size = range_size * v_size_row; - io.write_tensor(layer.v, range.first * v_size_row, buf_size); - } - } - } else { - // When v is transposed, we also need the element size and get the element ranges from each row - const uint32_t kv_size = size; - - for (const auto & layer : layers) { - const uint32_t il = layer.il; - - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // 
Write value type - const int32_t v_type_i = (int32_t)layer.v->type; - io.write(&v_type_i, sizeof(v_type_i)); - - // Write element size - const uint32_t v_size_el = ggml_type_size(layer.v->type); - io.write(&v_size_el, sizeof(v_size_el)); - - // Write GQA embedding size - io.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa)); - - // For each row, we get the element values of each cell - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - // Read each range of cells of v_size_el length each into tmp_buf and write out - for (const auto & range : cell_ranges) { - const size_t range_size = range.second - range.first; - const size_t src_offset = (range.first + j * kv_size) * v_size_el; - const size_t buf_size = range_size * v_size_el; - io.write_tensor(layer.v, src_offset, buf_size); - } - } - } - } -} - -bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) { - if (dest_seq_id != -1) { - // single sequence - - seq_rm(dest_seq_id, -1, -1); - - llama_sbatch sbatch; - llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false); - - batch.n_tokens = cell_count; - - for (uint32_t i = 0; i < cell_count; ++i) { - llama_pos pos; - uint32_t n_seq_id; - - io.read_to(&pos, sizeof(pos)); - io.read_to(&n_seq_id, sizeof(n_seq_id)); - - if (n_seq_id != 0) { - LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__); - return false; - } - - batch.pos[i] = pos; - batch.n_seq_id[i] = 1; - batch.seq_id[i] = &dest_seq_id; - } - - if (!find_slot(batch)) { - LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); - return false; - } - - commit(); - - // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values) - // Assume that this is one contiguous block of cells - GGML_ASSERT(head + cell_count <= size); - GGML_ASSERT(cells[head].pos == batch.pos[0]); - GGML_ASSERT(cells[head + cell_count - 1].pos == batch.pos[cell_count - 1]); - GGML_ASSERT(cells[head].has_seq_id(dest_seq_id)); - GGML_ASSERT(cells[head + cell_count - 1].has_seq_id(dest_seq_id)); - } else { - // whole KV cache restore - - if (cell_count > size) { - LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__); - return false; - } - - clear(); - - for (uint32_t i = 0; i < cell_count; ++i) { - kv_cell & cell = cells[i]; - - llama_pos pos; - uint32_t n_seq_id; - - io.read_to(&pos, sizeof(pos)); - io.read_to(&n_seq_id, sizeof(n_seq_id)); - - cell.pos = pos; - - for (uint32_t j = 0; j < n_seq_id; ++j) { - llama_seq_id seq_id; - io.read_to(&seq_id, sizeof(seq_id)); - - if (seq_id < 0 || (uint32_t) seq_id >= n_seq_max) { - LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, n_seq_max); - return false; - } - - cell.seq_id.insert(seq_id); - } - } - - head = 0; - used = cell_count; - } - - return true; -} - -bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell_count) { - uint32_t v_trans; - uint32_t n_layer; - - io.read_to(&v_trans, sizeof(v_trans)); - io.read_to(&n_layer, sizeof(n_layer)); - - if (n_layer != layers.size()) { - LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, (uint32_t) layers.size()); - return false; - } - if (cell_count > size) { - LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size); - return false; - } - if (this->v_trans != (bool) v_trans) { - LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__); - return 
false; - } - - // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block - for (const auto & layer : layers) { - const uint32_t il = layer.il; - - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); - - // Read type of key - int32_t k_type_i_ref; - io.read_to(&k_type_i_ref, sizeof(k_type_i_ref)); - const int32_t k_type_i = (int32_t) layer.k->type; - if (k_type_i != k_type_i_ref) { - LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il); - return false; - } - - // Read row size of key - uint64_t k_size_row_ref; - io.read_to(&k_size_row_ref, sizeof(k_size_row_ref)); - const size_t k_size_row = ggml_row_size(layer.k->type, n_embd_k_gqa); - if (k_size_row != k_size_row_ref) { - LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il); - return false; - } - - if (cell_count) { - // Read and set the keys for the whole cell range - ggml_backend_tensor_set(layer.k, io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row); - } - } - - if (!this->v_trans) { - for (const auto & layer : layers) { - const uint32_t il = layer.il; - - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Read type of value - int32_t v_type_i_ref; - io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); - const int32_t v_type_i = (int32_t)layer.v->type; - if (v_type_i != v_type_i_ref) { - LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); - return false; - } - - // Read row size of value - uint64_t v_size_row_ref; - io.read_to(&v_size_row_ref, sizeof(v_size_row_ref)); - const size_t v_size_row = ggml_row_size(layer.v->type, n_embd_v_gqa); - if (v_size_row != v_size_row_ref) { - LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il); - return false; - } - - if (cell_count) { - // Read and set the values for the whole cell range - ggml_backend_tensor_set(layer.v, io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row); - } - } - } else { - // For each layer, read the values for each cell (transposed) - for (const auto & layer : layers) { - const uint32_t il = layer.il; - - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Read type of value - int32_t v_type_i_ref; - io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); - const int32_t v_type_i = (int32_t)layer.v->type; - if (v_type_i != v_type_i_ref) { - LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); - return false; - } - - // Read element size of value - uint32_t v_size_el_ref; - io.read_to(&v_size_el_ref, sizeof(v_size_el_ref)); - const size_t v_size_el = ggml_type_size(layer.v->type); - if (v_size_el != v_size_el_ref) { - LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il); - return false; - } - - // Read GQA embedding size - uint32_t n_embd_v_gqa_ref; - io.read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref)); - if (n_embd_v_gqa != n_embd_v_gqa_ref) { - LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il); - return false; - } - - if (cell_count) { - // For each row in the transposed matrix, read the values for the whole cell range - for (uint32_t j = 0; j < 
n_embd_v_gqa; ++j) { - const size_t dst_offset = (head + j * size) * v_size_el; - ggml_backend_tensor_set(layer.v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); - } - } - } - } - - return true; -} - -// -// llama_kv_cache_unified_iswa -// - -llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa( - const llama_model & model, - ggml_type type_k, - ggml_type type_v, - bool v_trans, - bool offload, - bool swa_full, - uint32_t kv_size, - uint32_t n_seq_max, - uint32_t n_batch, - uint32_t n_pad) : hparams(model.hparams) { - llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); }; - llama_kv_cache_unified::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); }; - - const uint32_t size_base = kv_size; - - uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_batch, n_pad)); - - // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size and disable pruning - if (swa_full) { - LLAMA_LOG_WARN("%s: using full-size SWA cache (ref: %s)\n", - __func__, "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055"); - - size_swa = size_base; - do_prune = false; - } - - LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base); - - kv_base = std::make_unique( - model, std::move(filter_base), type_k, type_v, - v_trans, offload, size_base, n_seq_max, n_pad, - 0, LLAMA_SWA_TYPE_NONE); - - LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa); - - kv_swa = std::make_unique( - model, std::move(filter_swa), type_k, type_v, - v_trans, offload, size_swa, n_seq_max, n_pad, - hparams.n_swa, hparams.swa_type); -} - -void llama_kv_cache_unified_iswa::clear() { - kv_base->clear(); - kv_swa ->clear(); -} - -bool llama_kv_cache_unified_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { - bool res = true; - - res = res & kv_base->seq_rm(seq_id, p0, p1); - res = res & kv_swa ->seq_rm(seq_id, p0, p1); - - return res; -} - -void llama_kv_cache_unified_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { - kv_base->seq_cp(seq_id_src, seq_id_dst, p0, p1); - kv_swa ->seq_cp(seq_id_src, seq_id_dst, p0, p1); -} - -void llama_kv_cache_unified_iswa::seq_keep(llama_seq_id seq_id) { - kv_base->seq_keep(seq_id); - kv_swa ->seq_keep(seq_id); -} - -void llama_kv_cache_unified_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { - kv_base->seq_add(seq_id, p0, p1, delta); - kv_swa ->seq_add(seq_id, p0, p1, delta); -} - -void llama_kv_cache_unified_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { - kv_base->seq_div(seq_id, p0, p1, d); - kv_swa ->seq_div(seq_id, p0, p1, d); -} - -llama_pos llama_kv_cache_unified_iswa::seq_pos_min(llama_seq_id seq_id) const { - // the base cache is a superset of the SWA cache, so we can just check the SWA cache - return kv_swa->seq_pos_min(seq_id); -} - -llama_pos llama_kv_cache_unified_iswa::seq_pos_max(llama_seq_id seq_id) const { - return kv_swa->seq_pos_max(seq_id); -} - -void llama_kv_cache_unified_iswa::restore() { - kv_base->restore(); - kv_swa ->restore(); -} - -void llama_kv_cache_unified_iswa::commit() { - kv_base->commit(); - kv_swa ->commit(); - - // slide the attention window, forgetting/pruning old tokens that are outside the window - if (do_prune) { - for (const auto & [seq_id, entry] : pending.pos) { - kv_swa->prune_swa(seq_id, entry.pmin, entry.pmax); - } 
- - } - - pending.clear(); -} - -bool llama_kv_cache_unified_iswa::update(llama_context & lctx) { - bool res = true; - - res = res & kv_base->update(lctx); - res = res & kv_swa ->update(lctx); - - return res; -} - -void llama_kv_cache_unified_iswa::defrag_sched(float thold) { - kv_base->defrag_sched(thold); - kv_swa ->defrag_sched(thold); -} - -void llama_kv_cache_unified_iswa::set_full() { - kv_base->set_full(); - kv_swa ->set_full(); -} - -llama_sbatch llama_kv_cache_unified_iswa::sbatch_init(const llama_batch & batch, bool logits_all) { - pending.clear(); - - if (do_prune) { - for (int i = 0; i < batch.n_tokens; ++i) { - for (int s = 0; s < batch.n_seq_id[i]; ++s) { - const llama_seq_id seq_id = batch.seq_id[i][s]; - const llama_pos pos = batch.pos[i]; - - if (pending.pos.find(seq_id) == pending.pos.end()) { - pending.pos[seq_id].pmin = pos; - pending.pos[seq_id].pmax = pos; - } else { - pending.pos[seq_id].pmin = std::min(pending.pos[seq_id].pmin, pos); - pending.pos[seq_id].pmax = std::max(pending.pos[seq_id].pmax, pos); - } - } - } - } - - return llama_sbatch(batch, hparams.n_embd, true, logits_all); -} - -llama_ubatch llama_kv_cache_unified_iswa::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const { - GGML_UNUSED(embd_pooled); - return sbatch.split_simple(n_ubatch); -} - -bool llama_kv_cache_unified_iswa::find_slot(const llama_ubatch & batch) { - bool res = true; - - res = res & kv_base->find_slot(batch); - res = res & kv_swa ->find_slot(batch); - - return res; -} - -bool llama_kv_cache_unified_iswa::get_can_shift() const { - return kv_base->get_size() == kv_swa->get_size(); -} - -void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id) const { - kv_base->state_write(io, seq_id); - kv_swa ->state_write(io, seq_id); -} - -void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id) { - kv_base->state_read(io, seq_id); - kv_swa ->state_read(io, seq_id); -} - -llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_kv_base() const { - return kv_base.get(); -} - -llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_kv_swa() const { - return kv_swa.get(); -} - -// -// llama_kv_cache_recurrent -// - -llama_kv_cache_recurrent::llama_kv_cache_recurrent( - const llama_model & model, - ggml_type type_k, - ggml_type type_v, - bool offload, - uint32_t kv_size, - uint32_t n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) { - const int32_t n_layer = hparams.n_layer; - - LLAMA_LOG_INFO("%s: kv_size = %u, n_seq_max = %u, type_k = '%s', type_v = '%s', n_layer = %d\n", - __func__, kv_size, n_seq_max, ggml_type_name(type_k), ggml_type_name(type_v), n_layer); - - head = 0; - size = kv_size; - used = 0; - - cells.clear(); - cells.resize(kv_size); - - // create a context for each buffer type - std::map ctx_map; - auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { - auto it = ctx_map.find(buft); - if (it == ctx_map.end()) { - ggml_init_params params = { - /*.mem_size =*/ size_t(2u*n_layer*ggml_tensor_overhead()), - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, - }; - - ggml_context * ctx = ggml_init(params); - if (!ctx) { - return nullptr; - } - - ctx_map[buft] = ctx; - ctxs.emplace_back(ctx); - - return ctx; - } - - return it->second; - }; - - k_l.reserve(n_layer); - v_l.reserve(n_layer); - - for (int i = 0; i < n_layer; i++) { - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + 
hparams.n_embd_v_s(); - - const char * dev_name = "CPU"; - - ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type(); - - if (offload) { - auto * dev = model.dev_layer(i); - buft = ggml_backend_dev_buffer_type(dev); - - dev_name = ggml_backend_dev_name(dev); - } - - LLAMA_LOG_DEBUG("%s, layer %3d: dev = %s\n", __func__, i, dev_name); - - ggml_context * ctx = ctx_for_buft(buft); - if (!ctx) { - throw std::runtime_error("failed to create ggml context for kv cache"); - } - - ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); - ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); - ggml_format_name(k, "cache_k_l%d", i); - ggml_format_name(v, "cache_v_l%d", i); - k_l.push_back(k); - v_l.push_back(v); - } - - // allocate tensors and initialize the buffers to avoid NaNs in the padding - for (auto it : ctx_map) { - auto * buft = it.first; - auto * ctx = it.second; - - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); - if (!buf) { - throw std::runtime_error("failed to allocate buffer for kv cache"); - } - ggml_backend_buffer_clear(buf, 0); - LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); - bufs.emplace_back(buf); - } - - { - const size_t memory_size_k = size_k_bytes(); - const size_t memory_size_v = size_v_bytes(); - - LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, - (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), - ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), - ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); - } -} - -void llama_kv_cache_recurrent::clear() { - for (int32_t i = 0; i < (int32_t) size; ++i) { - cells[i].pos = -1; - cells[i].seq_id.clear(); - cells[i].src = -1; - cells[i].tail = -1; - } - head = 0; - used = 0; - - for (auto & buf : bufs) { - ggml_backend_buffer_clear(buf.get(), 0); - } -} - -bool llama_kv_cache_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { - uint32_t new_head = size; - - if (p0 < 0) { - p0 = 0; - } - - if (p1 < 0) { - p1 = std::numeric_limits::max(); - } - - // models like Mamba or RWKV can't have a state partially erased - if (seq_id >= (int64_t) size) { - // could be fatal - return false; - } - if (0 <= seq_id) { - int32_t & tail_id = cells[seq_id].tail; - if (tail_id >= 0) { - const kv_cell & cell = cells[tail_id]; - // partial intersection is invalid - if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) { - return false; - } - // invalidate tails which will be cleared - if (p0 <= cell.pos && cell.pos < p1) { - tail_id = -1; - } - } - } else { - // seq_id is negative, then the range should include everything or nothing - if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits::max())) { - return false; - } - } - - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].pos >= p0 && cells[i].pos < p1) { - if (seq_id < 0) { - cells[i].seq_id.clear(); - } else if (cells[i].has_seq_id(seq_id)) { - cells[i].seq_id.erase(seq_id); - } else { - continue; - } - if (cells[i].is_empty()) { - // keep count of the number of used cells - if (cells[i].pos >= 0) { - used--; - } - cells[i].pos = -1; - cells[i].src = -1; - if (new_head == size) { - new_head = i; - } - } - } - } - - // If we freed up a slot, set head to it so searching can start there. 
- if (new_head != size && new_head < head) { - head = new_head; - } - - return true; -} - -void llama_kv_cache_recurrent::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { - if (seq_id_src == seq_id_dst) { - return; - } - - if (p0 < 0) { - p0 = 0; - } - - if (p1 < 0) { - p1 = std::numeric_limits::max(); - } - - if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) { - kv_cell & tail_src = cells[seq_id_src]; - kv_cell & tail_dst = cells[seq_id_dst]; - if (tail_dst.tail >= 0) { - // clear destination seq_id if it wasn't empty - kv_cell & cell_dst = cells[tail_dst.tail]; - - cell_dst.seq_id.erase(seq_id_dst); - tail_dst.tail = -1; - if (cell_dst.seq_id.empty()) { - cell_dst.pos = -1; - cell_dst.src = -1; - used -= 1; - } - } - if (tail_src.tail >= 0) { - kv_cell & cell_src = cells[tail_src.tail]; - - cell_src.seq_id.insert(seq_id_dst); - tail_dst.tail = tail_src.tail; - } - } -} - -void llama_kv_cache_recurrent::seq_keep(llama_seq_id seq_id) { - uint32_t new_head = size; - - for (uint32_t i = 0; i < size; ++i) { - if ((llama_seq_id) i != seq_id) { - cells[i].tail = -1; - } - - if (!cells[i].has_seq_id(seq_id)) { - if (cells[i].pos >= 0) { - used--; - } - - cells[i].pos = -1; - cells[i].src = -1; - cells[i].seq_id.clear(); - - if (new_head == size){ - new_head = i; - } - } else { - cells[i].seq_id.clear(); - cells[i].seq_id.insert(seq_id); - } - } - - // If we freed up a slot, set head to it so searching can start there. - if (new_head != size && new_head < head) { - head = new_head; - } -} - -void llama_kv_cache_recurrent::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { - if (delta == 0) { - return; - } - - if (p0 < 0) { - p0 = 0; - } - - if (p1 < 0) { - p1 = std::numeric_limits::max(); - } - - // If there is no range then return early to avoid looping over the - if (p0 == p1) { - return; - } - - // for Mamba-like or RWKV models, only the pos needs to be shifted - if (0 <= seq_id && seq_id < (int64_t) size) { - const int32_t tail_id = cells[seq_id].tail; - if (tail_id >= 0) { - kv_cell & cell = cells[tail_id]; - if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { - cell.pos += delta; - } - } - } -} - -void llama_kv_cache_recurrent::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { - if (d == 1) { - return; - } - - if (p0 < 0) { - p0 = 0; - } - - if (p1 < 0) { - p1 = std::numeric_limits::max(); - } - - // If there is no range then return early to avoid looping over the cache. 
- if (p0 == p1) { - return; - } - - // for Mamba-like or RWKV models, only the pos needs to be changed - if (0 <= seq_id && seq_id < (int64_t) size) { - const int32_t tail_id = cells[seq_id].tail; - if (tail_id >= 0) { - kv_cell & cell = cells[tail_id]; - if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { - cell.pos /= d; - } - } - } -} - -llama_pos llama_kv_cache_recurrent::seq_pos_min(llama_seq_id seq_id) const { - llama_pos result = std::numeric_limits::max(); - - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].has_seq_id(seq_id)) { - result = std::min(result, cells[i].pos); - } - } - - if (result == std::numeric_limits::max()) { - result = -1; - } - - return result; -} - -llama_pos llama_kv_cache_recurrent::seq_pos_max(llama_seq_id seq_id) const { - llama_pos result = -1; - - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].has_seq_id(seq_id)) { - result = std::max(result, cells[i].pos); - } - } - - return result; -} - -void llama_kv_cache_recurrent::restore() { - if (pending.ranges.empty()) { - return; - } - - seq_rm(-1, -1, -1); -} - -void llama_kv_cache_recurrent::commit() { - pending.ranges.clear(); -} - -bool llama_kv_cache_recurrent::update(llama_context & ctx) { - GGML_UNUSED(ctx); - return false; -} - -void llama_kv_cache_recurrent::defrag_sched(float thold) { - GGML_UNUSED(thold); - // noop -} - -void llama_kv_cache_recurrent::set_full() { - n = size; - head = 0; -} - -llama_sbatch llama_kv_cache_recurrent::sbatch_init( - const llama_batch & batch, - bool logits_all) { - return llama_sbatch(batch, hparams.n_embd, false, logits_all); -} - -llama_ubatch llama_kv_cache_recurrent::ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const { - if (embd_pooled) { - // Pooled embeddings cannot be split across ubatches (yet) - return sbatch.split_seq(n_ubatch); - } - - return sbatch.split_equal(n_ubatch); -} - -bool llama_kv_cache_recurrent::find_slot( - const llama_ubatch & ubatch) { - const uint32_t n_tokens = ubatch.n_tokens; - const uint32_t n_seqs = ubatch.n_seqs; - - const uint32_t n_seq_tokens = ubatch.n_seq_tokens; - - // if we have enough unused cells before the current head -> - // better to start searching from the beginning of the cache, hoping to fill it - if (head > used + 2*n_tokens) { - head = 0; - } - - // For recurrent state architectures (like Mamba or RWKV), - // each cache cell can store the state for a whole sequence. - // A slot should be always be contiguous. - - // can only process batches with an equal number of new tokens in each sequence - GGML_ASSERT(ubatch.equal_seqs); - - int32_t min = size - 1; - int32_t max = 0; - - // everything should fit if all seq_ids are smaller than the max - for (uint32_t s = 0; s < n_seqs; ++s) { - const uint32_t n_seq_id = ubatch.n_seq_id[s]; - for (uint32_t j = 0; j < n_seq_id; ++j) { - const llama_seq_id seq_id = ubatch.seq_id[s][j]; - - if (seq_id < 0 || (uint32_t) seq_id >= size) { - // too big seq_id - // TODO: would it be possible to resize the cache instead? 
- LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%u Try using a bigger --parallel value\n", __func__, seq_id, n_seq_max); - return false; - } - if (j > 0) { - kv_cell & seq = cells[seq_id]; - if (seq.tail >= 0) { - kv_cell & cell = cells[seq.tail]; - // clear cells from seq_ids that become shared - // (should not normally happen, but let's handle it anyway) - cell.seq_id.erase(seq_id); - seq.tail = -1; - if (cell.seq_id.empty()) { - cell.pos = -1; - cell.src = -1; - used -= 1; - } - } - } - } - } - -#ifndef NDEBUG - { - std::vector tails_verif; - tails_verif.assign(size, -1); - for (uint32_t i = 0; i < size; ++i) { - kv_cell & cell = cells[i]; - for (llama_seq_id seq_id : cell.seq_id) { - if (tails_verif[seq_id] != -1) { - LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]); - } - tails_verif[seq_id] = i; - } - } - for (uint32_t i = 0; i < size; ++i) { - if (tails_verif[i] != cells[i].tail) { - LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cells[i].tail, tails_verif[i]); - } - } - } -#endif - - // find next empty cell - uint32_t next_empty_cell = head; - - for (uint32_t i = 0; i < size; ++i) { - if (next_empty_cell >= size) { next_empty_cell -= size; } - kv_cell & cell = cells[next_empty_cell]; - if (cell.is_empty()) { break; } - next_empty_cell += 1; - } - - // find usable cell range - for (uint32_t s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - kv_cell & seq_meta = cells[seq_id]; - bool has_cell = false; - if (seq_meta.tail >= 0) { - kv_cell & cell = cells[seq_meta.tail]; - GGML_ASSERT(cell.has_seq_id(seq_id)); - // does this seq_id "own" the cell? - if (cell.seq_id.size() == 1) { has_cell = true; } - } - if (!has_cell) { - kv_cell & empty_cell = cells[next_empty_cell]; - GGML_ASSERT(empty_cell.is_empty()); - // copy old tail into the empty cell - if (seq_meta.tail >= 0) { - kv_cell & orig_cell = cells[seq_meta.tail]; - empty_cell.pos = orig_cell.pos; - empty_cell.src = orig_cell.src; - orig_cell.seq_id.erase(seq_id); - empty_cell.seq_id.insert(seq_id); // will be overwritten - } - seq_meta.tail = next_empty_cell; - // find next empty cell - if (s + 1 < n_seqs) { - next_empty_cell += 1; - for (uint32_t i = 0; i < size; ++i) { - if (next_empty_cell >= size) { next_empty_cell -= size; } - kv_cell & cell = cells[next_empty_cell]; - if (cell.is_empty()) { break; } - next_empty_cell += 1; - } - } - } - if (min > seq_meta.tail) { min = seq_meta.tail; } - if (max < seq_meta.tail) { max = seq_meta.tail; } - } - - // gather and re-order - for (uint32_t s = 0; s < n_seqs; ++s) { - int32_t dst_id = s + min; - int32_t src_id = cells[ubatch.seq_id[s][0]].tail; - if (dst_id != src_id) { - kv_cell & dst_cell = cells[dst_id]; - kv_cell & src_cell = cells[src_id]; - - std::swap(dst_cell.pos, src_cell.pos); - std::swap(dst_cell.src, src_cell.src); - std::swap(dst_cell.seq_id, src_cell.seq_id); - - // swap tails (assuming they NEVER overlap) - for (const llama_seq_id seq_id : src_cell.seq_id) { - cells[seq_id].tail = src_id; - } - for (const llama_seq_id seq_id : dst_cell.seq_id) { - cells[seq_id].tail = dst_id; - } - } - } - - // update the pos of the used seqs - for (uint32_t s = 0; s < n_seqs; ++s) { - const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1]; - int32_t cell_id = s + min; - kv_cell & cell = cells[cell_id]; - - if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) { - // What should happen when the pos backtracks or skips a 
value? - // Clearing the state mid-batch would require special-casing which isn't done. - LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n", - __func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens); - } - cell.pos = last_pos; - cell.seq_id.clear(); - for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) { - const llama_seq_id seq_id = ubatch.seq_id[s][j]; - cell.seq_id.insert(seq_id); - cells[seq_id].tail = cell_id; - } - } - - // allow getting the range of used cells, from head to head + n - head = min; - n = max - min + 1; - used = std::count_if(cells.begin(), cells.end(), - [](const kv_cell & cell){ return !cell.is_empty(); }); - - // sanity check - return n >= n_seqs; -} - -bool llama_kv_cache_recurrent::get_can_shift() const { - return false; -} - -int32_t llama_kv_cache_recurrent::s_copy(int i) const { - const uint32_t cell_id = i + head; - - ////////////////////////////////////////////// - // TODO: this should not mutate the KV cache ! - kv_cell & cell = const_cast(cells[cell_id]); - - // prevent out-of-bound sources - if (cell.src < 0 || (uint32_t) cell.src >= size) { - cell.src = cell_id; - } - - int32_t res = cell.src; - - // TODO: do not mutate the KV cache - // ensure copy only happens once - if (cell.src != (int32_t) cell_id) { - cell.src = cell_id; - } - - return res; -} - -float llama_kv_cache_recurrent::s_mask(int i) const { - const uint32_t cell_id = i + head; - - ////////////////////////////////////////////// - // TODO: this should not mutate the KV cache ! - kv_cell & cell = const_cast(cells[cell_id]); - - float res = (float) (cell.src >= 0); - - // only clear once - if (cell.src < 0) { - cell.src = cell_id; - } - - return res; -} - -uint32_t llama_kv_cache_recurrent::cell_max() const { - for (uint32_t i = size; i > 0; --i) { - const kv_cell & cell = cells[i - 1]; - - if (cell.pos >= 0 && !cell.is_empty()) { - return i; - } - } - - return 0; -} - -size_t llama_kv_cache_recurrent::total_size() const { - size_t size = 0; - for (const auto & buf : bufs) { - size += ggml_backend_buffer_get_size(buf.get()); - } - - return size; -} - -size_t llama_kv_cache_recurrent::size_k_bytes() const { - size_t size_k_bytes = 0; - - for (const auto & k : k_l) { - size_k_bytes += ggml_nbytes(k); - } - - return size_k_bytes; -} - -size_t llama_kv_cache_recurrent::size_v_bytes() const { - size_t size_v_bytes = 0; - - for (const auto & v : v_l) { - size_v_bytes += ggml_nbytes(v); - } - - return size_v_bytes; -} - -void llama_kv_cache_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id) const { - std::vector> cell_ranges; // ranges, from inclusive, to exclusive - uint32_t cell_count = 0; - - // Count the number of cells with the specified seq_id - // Find all the ranges of cells with this seq id (or all, when -1) - uint32_t cell_range_begin = size; - for (uint32_t i = 0; i < size; ++i) { - const auto & cell = cells[i]; - if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) { - ++cell_count; - if (cell_range_begin == size) { - cell_range_begin = i; - } - } else { - if (cell_range_begin != size) { - cell_ranges.emplace_back(cell_range_begin, i); - cell_range_begin = size; - } - } - } - if (cell_range_begin != size) { - cell_ranges.emplace_back(cell_range_begin, size); - } - - // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count - uint32_t cell_count_check = 0; - for (const auto & range : cell_ranges) { - cell_count_check += range.second - range.first; - } - GGML_ASSERT(cell_count == 
cell_count_check); - - io.write(&cell_count, sizeof(cell_count)); - - state_write_meta(io, cell_ranges, seq_id); - state_write_data(io, cell_ranges); -} - -void llama_kv_cache_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id) { - uint32_t cell_count; - io.read_to(&cell_count, sizeof(cell_count)); - - bool res = true; - - res = res && state_read_meta(io, cell_count, seq_id); - res = res && state_read_data(io, cell_count); - - if (!res) { - if (seq_id == -1) { - clear(); - } else { - seq_rm(seq_id, -1, -1); - } - throw std::runtime_error("failed to restore kv cache"); - } -} - -void llama_kv_cache_recurrent::state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id) const { - for (const auto & range : cell_ranges) { - for (uint32_t i = range.first; i < range.second; ++i) { - const auto & cell = cells[i]; - const llama_pos pos = cell.pos; - const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0; - - io.write(&pos, sizeof(pos)); - io.write(&n_seq_id, sizeof(n_seq_id)); - - if (n_seq_id) { - for (auto seq_id : cell.seq_id) { - io.write(&seq_id, sizeof(seq_id)); - } - } - } - } -} - -void llama_kv_cache_recurrent::state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges) const { - const uint32_t v_trans = 0; - const uint32_t n_layer = hparams.n_layer; - - io.write(&v_trans, sizeof(v_trans)); - io.write(&n_layer, sizeof(n_layer)); - - std::vector tmp_buf; - - // Iterate and write all the keys first, each row is a cell - // Get whole range at a time - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); - - // Write key type - const int32_t k_type_i = (int32_t)k_l[il]->type; - io.write(&k_type_i, sizeof(k_type_i)); - - // Write row size of key - const uint64_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); - io.write(&k_size_row, sizeof(k_size_row)); - - // Read each range of cells of k_size length each into tmp_buf and write out - for (const auto & range : cell_ranges) { - const size_t range_size = range.second - range.first; - const size_t buf_size = range_size * k_size_row; - io.write_tensor(k_l[il], range.first * k_size_row, buf_size); - } - } - - if (!v_trans) { - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Write value type - const int32_t v_type_i = (int32_t)v_l[il]->type; - io.write(&v_type_i, sizeof(v_type_i)); - - // Write row size of value - const uint64_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa); - io.write(&v_size_row, sizeof(v_size_row)); - - // Read each range of cells of v_size length each into tmp_buf and write out - for (const auto & range : cell_ranges) { - const size_t range_size = range.second - range.first; - const size_t buf_size = range_size * v_size_row; - io.write_tensor(v_l[il], range.first * v_size_row, buf_size); - } - } - } else { - // When v is transposed, we also need the element size and get the element ranges from each row - const uint32_t kv_size = size; - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Write value type - const int32_t v_type_i = (int32_t)v_l[il]->type; - io.write(&v_type_i, sizeof(v_type_i)); - - // Write element size - const uint32_t v_size_el = ggml_type_size(v_l[il]->type); - io.write(&v_size_el, sizeof(v_size_el)); - - // Write GQA embedding size - io.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa)); - - // 
For each row, we get the element values of each cell - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - // Read each range of cells of v_size_el length each into tmp_buf and write out - for (const auto & range : cell_ranges) { - const size_t range_size = range.second - range.first; - const size_t src_offset = (range.first + j * kv_size) * v_size_el; - const size_t buf_size = range_size * v_size_el; - io.write_tensor(v_l[il], src_offset, buf_size); - } - } - } - } -} - -bool llama_kv_cache_recurrent::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) { - if (dest_seq_id != -1) { - // single sequence - - seq_rm(dest_seq_id, -1, -1); - - llama_sbatch sbatch; - llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false); - - batch.n_tokens = cell_count; - batch.n_seq_tokens = cell_count; - batch.n_seqs = 1; - - for (uint32_t i = 0; i < cell_count; ++i) { - llama_pos pos; - uint32_t n_seq_id; - - io.read_to(&pos, sizeof(pos)); - io.read_to(&n_seq_id, sizeof(n_seq_id)); - - if (n_seq_id != 0) { - LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__); - return false; - } - - batch.pos[i] = pos; - } - batch.n_seq_id[0] = 1; - batch.seq_id[0] = &dest_seq_id; - if (!find_slot(batch)) { - LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); - return false; - } - commit(); - - // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values) - // Assume that this is one contiguous block of cells - GGML_ASSERT(head + cell_count <= size); - GGML_ASSERT(cells[head].pos == batch.pos[0]); - GGML_ASSERT(cells[head + cell_count - 1].pos == batch.pos[cell_count - 1]); - GGML_ASSERT(cells[head].has_seq_id(dest_seq_id)); - GGML_ASSERT(cells[head + cell_count - 1].has_seq_id(dest_seq_id)); - } else { - // whole KV cache restore - - if (cell_count > size) { - LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__); - return false; - } - - clear(); - - for (uint32_t i = 0; i < cell_count; ++i) { - kv_cell & cell = cells[i]; - - llama_pos pos; - uint32_t n_seq_id; - - io.read_to(&pos, sizeof(pos)); - io.read_to(&n_seq_id, sizeof(n_seq_id)); - - cell.pos = pos; - - for (uint32_t j = 0; j < n_seq_id; ++j) { - llama_seq_id seq_id; - io.read_to(&seq_id, sizeof(seq_id)); - - // TODO: llama_kv_cache_recurrent should have a notion of max sequences - //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) { - if (seq_id < 0) { - //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); - LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id); - return false; - } - - cell.seq_id.insert(seq_id); - - int32_t & tail = cells[seq_id].tail; - if (tail != -1) { - LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail); - return false; - } - tail = i; - } - } - - head = 0; - used = cell_count; - } - - for (uint32_t i = 0; i < cell_count; ++i) { - uint32_t cell_id = head + i; - // make sure the recurrent states will keep their restored state - cells[cell_id].src = cell_id; - } - - return true; -} - -bool llama_kv_cache_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell_count) { - uint32_t v_trans; - uint32_t n_layer; - io.read_to(&v_trans, sizeof(v_trans)); - io.read_to(&n_layer, sizeof(n_layer)); - - if (n_layer != hparams.n_layer) { - LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, 
hparams.n_layer); - return false; - } - if (cell_count > size) { - LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size); - return false; - } - if (false != (bool) v_trans) { - LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__); - return false; - } - - // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); - - // Read type of key - int32_t k_type_i_ref; - io.read_to(&k_type_i_ref, sizeof(k_type_i_ref)); - const int32_t k_type_i = (int32_t) k_l[il]->type; - if (k_type_i != k_type_i_ref) { - LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il); - return false; - } - - // Read row size of key - uint64_t k_size_row_ref; - io.read_to(&k_size_row_ref, sizeof(k_size_row_ref)); - const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); - if (k_size_row != k_size_row_ref) { - LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il); - return false; - } - - if (cell_count) { - // Read and set the keys for the whole cell range - ggml_backend_tensor_set(k_l[il], io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row); - } - } - - if (!v_trans) { - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Read type of value - int32_t v_type_i_ref; - io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); - const int32_t v_type_i = (int32_t)v_l[il]->type; - if (v_type_i != v_type_i_ref) { - LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); - return false; - } - - // Read row size of value - uint64_t v_size_row_ref; - io.read_to(&v_size_row_ref, sizeof(v_size_row_ref)); - const size_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa); - if (v_size_row != v_size_row_ref) { - LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il); - return false; - } - - if (cell_count) { - // Read and set the values for the whole cell range - ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row); - } - } - } else { - // For each layer, read the values for each cell (transposed) - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Read type of value - int32_t v_type_i_ref; - io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); - const int32_t v_type_i = (int32_t)v_l[il]->type; - if (v_type_i != v_type_i_ref) { - LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); - return false; - } - - // Read element size of value - uint32_t v_size_el_ref; - io.read_to(&v_size_el_ref, sizeof(v_size_el_ref)); - const size_t v_size_el = ggml_type_size(v_l[il]->type); - if (v_size_el != v_size_el_ref) { - LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il); - return false; - } - - // Read GQA embedding size - uint32_t n_embd_v_gqa_ref; - io.read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref)); - if (n_embd_v_gqa != n_embd_v_gqa_ref) { - LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer 
%d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il); - return false; - } - - if (cell_count) { - // For each row in the transposed matrix, read the values for the whole cell range - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - const size_t dst_offset = (head + j * size) * v_size_el; - ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); - } - } - } - } - - return true; -} diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 191a1090a..2d04705f2 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -2,58 +2,34 @@ #include "llama.h" #include "llama-io.h" -#include "llama-graph.h" #include "llama-memory.h" -#include "ggml-cpp.h" - -#include -#include -#include - -struct llama_cparams; -struct llama_hparams; -struct llama_ubatch; -struct llama_sbatch; -struct llama_model; -struct llama_context; - struct llama_kv_cache : public llama_memory_i { virtual ~llama_kv_cache() = default; - // call if batch processing fails - restores the cache state - virtual void restore() = 0; + // split the input batch into a set of ubatches and verify that they can fit into the cache + // return a state object containing the ubatches and KV cache state required to process them + // check the llama_memory_state_i::get_status() for the result + virtual llama_memory_state_ptr init_batch( + const llama_batch & batch, + uint32_t n_ubatch, + bool embd_pooled, + bool logits_all) = 0; - // call after successful batch processing - clears any pending state - virtual void commit() = 0; + // simulate full cache, used for allocating worst-case compute buffers + virtual llama_memory_state_ptr init_full() = 0; // process any pending defrag/shift/etc. operations // optionally call once before processing a new batch + // return true if any operations were performed virtual bool update(llama_context & lctx) = 0; // schedule a defrag if the fragmentation threshold is exceeded. 
otherwise, do nothing + // TODO: change to + // llama_memory_state_ptr init_defrag(float thold) = 0; + // virtual void defrag_sched(float thold) = 0; - // simulate full cache, used for allocating worst-case compute buffers - virtual void set_full() = 0; - - // - // batch processing - // - - // ============================================================================================================= - // TODO: refactor and simplify this - - virtual llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) = 0; - - // different KV caches require different batch splitting strategies - virtual llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const = 0; - - // find an empty slot of size "n_tokens" in the cache - virtual bool find_slot(const llama_ubatch & batch) = 0; - - // ============================================================================================================= - // getters virtual bool get_can_shift() const = 0; @@ -66,450 +42,3 @@ struct llama_kv_cache : public llama_memory_i { virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0; virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) = 0; }; - -// -// llama_kv_cache_guard -// - -struct llama_kv_cache_guard { - llama_kv_cache_guard(llama_kv_cache * kv) : kv(kv) {} - - ~llama_kv_cache_guard() { - kv->restore(); - } - - void commit() { - kv->commit(); - } - -private: - llama_kv_cache * kv; -}; - -// -// llama_kv_cache_unified -// - -class llama_kv_cache_unified : public llama_kv_cache { -public: - static uint32_t get_padding(const llama_cparams & cparams); - - // this callback is used to filter out layers that should not be included in the cache - using layer_filter_cb = std::function; - - llama_kv_cache_unified( - const llama_model & model, - layer_filter_cb && filter, - ggml_type type_k, - ggml_type type_v, - bool v_trans, - bool offload, - uint32_t kv_size, - uint32_t n_seq_max, - uint32_t n_pad, - uint32_t n_swa, - llama_swa_type swa_type); - - ~llama_kv_cache_unified() = default; - - // - // llama_memory_i - // - - void clear() override; - - bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override; - void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override; - void seq_keep(llama_seq_id seq_id) override; - void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override; - void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override; - - llama_pos seq_pos_min(llama_seq_id seq_id) const override; - llama_pos seq_pos_max(llama_seq_id seq_id) const override; - - // - // llama_kv_cache - // - - void restore() override; - void commit() override; - - bool update(llama_context & ctx) override; - - void defrag_sched(float thold) override; - - void set_full() override; - - llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override; - llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override; - - // updates the cache head - // Note: On success, it's important that cache.head points - // to the first cell of the slot. 
- bool find_slot(const llama_ubatch & batch) override; - - bool get_can_shift() const override; - - // state write/load - - void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override; - void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override; - - // - // llama_kv_cache_unified specific API - // - - uint32_t get_n() const; - uint32_t get_size() const; - - // get views of the current state of the cache - ggml_tensor * get_k(ggml_context * ctx, int32_t il) const; - ggml_tensor * get_v(ggml_context * ctx, int32_t il) const; - - // store k_cur and v_cur in the cache based on the current head location - ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const; - ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const; - - void prune_swa(llama_seq_id seq_id, llama_pos pmin, llama_pos pmax); - - void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const; - void set_input_k_shift (ggml_tensor * dst) const; - void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const; - -private: - const llama_model & model; - const llama_hparams & hparams; - - struct kv_cell { - llama_pos pos = -1; - llama_pos delta = 0; - - // TODO: replace with bitset uint64_t - std::set seq_id; - - bool has_seq_id(const llama_seq_id & id) const { - return seq_id.find(id) != seq_id.end(); - } - - bool is_empty() const { - return seq_id.empty(); - } - - bool is_same_seq(const kv_cell & other) const { - return seq_id == other.seq_id; - } - }; - - struct kv_layer { - // layer index in the model - // note: can be different from the layer index in the KV cache - uint32_t il; - - ggml_tensor * k; - ggml_tensor * v; - }; - - bool has_shift = false; - bool do_defrag = false; - bool v_trans = true; // the value tensor is transposed - - uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot()) - uint32_t size = 0; // total number of cells, shared across all sequences - uint32_t used = 0; // used cells (i.e. 
at least one seq_id) (TODO: add `struct kv_cells` and keep track automaticallt) - - // computed before each graph build - uint32_t n = 0; - - const uint32_t n_seq_max = 1; - - // required padding - const uint32_t n_pad = 1; - - // SWA - const uint32_t n_swa = 0; - - const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE; - - std::vector ctxs; - std::vector bufs; - - std::vector cells; // TODO: replace with `struct kv_cells` - std::vector layers; - - // model layer id -> KV cache layer id - std::unordered_map map_layer_ids; - - // recovery information used to restore the KV cells to their original state in case of a failure - struct { - void clear() { - cells.clear(); - } - - std::unordered_map cells; - } recovery; - - // defrag - struct { - std::vector ids; - } defrag_info; - - // return true if cells have been moved - bool defrag_prepare(int32_t n_max_nodes); - - // find how many cells are currently in use - uint32_t cell_max() const; - - size_t total_size() const; - - size_t size_k_bytes() const; - size_t size_v_bytes() const; - - bool is_masked_swa(llama_pos p0, llama_pos p1) const; - - ggml_tensor * build_rope_shift( - const llama_cparams & cparams, - ggml_context * ctx, - ggml_tensor * cur, - ggml_tensor * shift, - ggml_tensor * factors, - float freq_base, - float freq_scale) const; - - llm_graph_result_ptr build_graph_shift( - const llama_cparams & cparams, - ggml_context * ctx, - ggml_cgraph * gf) const; - - llm_graph_result_ptr build_graph_defrag( - const llama_cparams & cparams, - ggml_context * ctx, - ggml_cgraph * gf) const; - - void state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) const; - void state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges) const; - - bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1); - bool state_read_data(llama_io_read_i & io, uint32_t cell_count); -}; - -// -// llama_kv_cache_unified_iswa -// - -// utilizes two instances of llama_kv_cache_unified -// the first instance is for the non-SWA layers of the model and the second instance is for the SWA layers -// upon successful commit, the SWA cache removes old tokens outside the n_swa window - -class llama_kv_cache_unified_iswa : public llama_kv_cache { -public: - llama_kv_cache_unified_iswa( - const llama_model & model, - ggml_type type_k, - ggml_type type_v, - bool v_trans, - bool offload, - bool swa_full, - uint32_t kv_size, - uint32_t n_seq_max, - uint32_t n_batch, - uint32_t n_pad); - - ~llama_kv_cache_unified_iswa() = default; - - // - // llama_memory_i - // - - void clear() override; - - bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override; - void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override; - void seq_keep(llama_seq_id seq_id) override; - void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override; - void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override; - - llama_pos seq_pos_min(llama_seq_id seq_id) const override; - llama_pos seq_pos_max(llama_seq_id seq_id) const override; - - // - // llama_kv_cache - // - - void restore() override; - void commit() override; - - bool update(llama_context & ctx) override; - - void defrag_sched(float thold) override; - - void set_full() override; - - llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override; - llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override; - - 
bool find_slot(const llama_ubatch & batch) override; - - bool get_can_shift() const override; - - // state write/load - - void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override; - void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override; - - // - // llama_kv_cache_unified_iswa specific API - // - - llama_kv_cache_unified * get_kv_base() const; - llama_kv_cache_unified * get_kv_swa () const; - -private: - const llama_hparams & hparams; - - bool do_prune = true; - - struct { - struct entry { - llama_pos pmin; - llama_pos pmax; - }; - - void clear() { - pos.clear(); - } - - // used to perform SWA pruning of old tokens - std::unordered_map pos; - } pending; - - std::unique_ptr kv_base; - std::unique_ptr kv_swa; -}; - -// -// llama_kv_cache_recurrent -// - -class llama_kv_cache_recurrent : public llama_kv_cache { -public: - struct kv_cell { - llama_pos pos = -1; - int32_t src = -1; // used to copy states - int32_t tail = -1; - - std::set seq_id; - - bool has_seq_id(const llama_seq_id & id) const { - return seq_id.find(id) != seq_id.end(); - } - - bool is_empty() const { - return seq_id.empty(); - } - - bool is_same_seq(const kv_cell & other) const { - return seq_id == other.seq_id; - } - }; - - llama_kv_cache_recurrent( - const llama_model & model, - ggml_type type_k, - ggml_type type_v, - bool offload, - uint32_t kv_size, - uint32_t n_seq_max); - - ~llama_kv_cache_recurrent() = default; - - // - // llama_memory_i - // - - void clear() override; - - bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override; - void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override; - void seq_keep(llama_seq_id seq_id) override; - void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override; - void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override; - - llama_pos seq_pos_min(llama_seq_id seq_id) const override; - llama_pos seq_pos_max(llama_seq_id seq_id) const override; - - // - // llama_kv_cache - // - - void restore() override; - void commit() override; - - bool update(llama_context & ctx) override; - - void defrag_sched(float thold) override; - - void set_full() override; - - llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override; - llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override; - - bool find_slot(const llama_ubatch & batch) override; - - bool get_can_shift() const override; - - // TODO: temporary methods - they are not really const as they do const_cast<>, fix this - int32_t s_copy(int i) const; - float s_mask(int i) const; - - // state write/load - - void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override; - void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override; - - uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot()) - uint32_t size = 0; // total number of cells, shared across all sequences - uint32_t used = 0; // used cells (i.e. 
at least one seq_id)
-
-    // computed before each graph build
-    uint32_t n = 0;
-
-    std::vector<kv_cell> cells;
-
-    std::vector<ggml_tensor *> k_l; // per layer
-    std::vector<ggml_tensor *> v_l;
-
-private:
-    //const llama_model & model;
-    const llama_hparams & hparams;
-
-    // commit/restore cache
-    // TODO: rework for recurrent cache
-    struct slot_range {
-        uint32_t c0 = 0; // note: these are cell indices, not sequence positions
-        uint32_t c1 = 0;
-    };
-
-    // pending cell updates that are not yet committed
-    struct {
-        std::vector<slot_range> ranges;
-    } pending;
-
-    const uint32_t n_seq_max = 1;
-
-    std::vector<ggml_context_ptr>        ctxs;
-    std::vector<ggml_backend_buffer_ptr> bufs;
-
-    // find how many cells are currently in use
-    uint32_t cell_max() const;
-
-    size_t total_size() const;
-
-    size_t size_k_bytes() const;
-    size_t size_v_bytes() const;
-
-    void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
-    void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
-
-    bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
-    bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
-};
diff --git a/src/llama-kv-cells.h b/src/llama-kv-cells.h
new file mode 100644
index 000000000..9e2c4d927
--- /dev/null
+++ b/src/llama-kv-cells.h
@@ -0,0 +1,410 @@
+#pragma once
+
+#include "llama.h"
+#include "llama-cparams.h"
+
+#include <bitset>
+#include <cassert>
+#include <set>
+#include <vector>
+
+// meta information about KV cells that can be part of multiple sequences at the same time
+// TODO: add unit tests
+class llama_kv_cells_unified {
+public:
+    void reset() {
+        for (uint32_t i = 0; i < pos.size(); ++i) {
+            pos[i]   = -1;
+            shift[i] =  0;
+            seq[i].reset();
+        }
+
+        has_shift = false;
+
+        used.clear();
+
+        for (uint32_t s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
+            seq_pos[s].clear();
+        }
+    }
+
+    void reset_shift() {
+        has_shift = false;
+
+        for (uint32_t i = 0; i < shift.size(); ++i) {
+            shift[i] = 0;
+        }
+    }
+
+    uint32_t size() const {
+        return pos.size();
+    }
+
+    void resize(uint32_t n) {
+        pos.resize(n);
+        shift.resize(n);
+        seq.resize(n);
+
+        reset();
+    }
+
+    bool is_empty(uint32_t i) const {
+        assert(i < pos.size());
+        assert((pos[i] < 0 && pos[i] == -1) || pos[i] >= 0);
+
+        return pos[i] == -1;
+    }
+
+    uint32_t get_used() const {
+        return used.size();
+    }
+
+    // the index of the first cell that is used
+    // return 0 if no cells are used
+    uint32_t used_min() const {
+        return used.empty() ? 0 : *used.begin();
+    }
+
+    // the index of the last cell that is used + 1
+    // return 0 if no cells are used
+    uint32_t used_max_p1() const {
+        return used.empty() ?
0 : *used.rbegin() + 1; + } + + bool get_has_shift() const { + return has_shift; + } + + // move cell isrc to idst (used during defrag) + void mv(uint32_t isrc, uint32_t idst) { + assert(isrc < pos.size()); + assert(idst < pos.size()); + + pos [idst] = pos [isrc]; + shift[idst] = shift[isrc]; + seq [idst] = seq [isrc]; + + pos [isrc] = -1; + shift[isrc] = 0; + seq [isrc].reset(); + + used.erase (isrc); + used.insert(idst); + } + + // copy the state of cells [i, i + n) (used for save/restore the state of the cells) + llama_kv_cells_unified cp(uint32_t i, uint32_t n) const { + assert(i + n <= pos.size()); + + llama_kv_cells_unified res; + + res.resize(n); + + for (uint32_t j = 0; j < n; ++j) { + res.pos[j] = pos[i + j]; + res.seq[j] = seq[i + j]; + + assert(shift[i + j] == 0); + } + + return res; + } + + // set the state of cells [i, i + other.pos.size()) (used for save/restore the state of the cells) + void set(uint32_t i, const llama_kv_cells_unified & other) { + assert(i + other.pos.size() <= pos.size()); + + for (uint32_t j = 0; j < other.pos.size(); ++j) { + if (pos[i + j] == -1 && other.pos[j] != -1) { + used.insert(i + j); + } + + if (pos[i + j] != -1 && other.pos[j] == -1) { + used.erase(i + j); + } + + if (pos[i + j] != -1) { + seq_pos_rm(i + j); + } + + pos[i + j] = other.pos[j]; + seq[i + j] = other.seq[j]; + + if (pos[i + j] != -1) { + seq_pos_add(i + j); + } + + assert(shift[i + j] == 0); + } + } + + // clear a non-empty cell + void rm(uint32_t i) { + assert(i < pos.size()); + assert(pos[i] != -1); + + seq_pos_rm(i); + + pos[i] = -1; + seq[i].reset(); + + used.erase(i); + } + + // note: call only if the cell has seq_id + // return true if the cell becomes empty + bool seq_rm(uint32_t i, llama_seq_id seq_id) { + assert(i < pos.size()); + assert(seq[i].test(seq_id)); + assert(pos[i] != -1); + assert(seq_id >= 0); + + seq[i].reset(seq_id); + seq_pos[seq_id].erase(pos[i]); + + if (seq[i].none()) { + pos[i] = -1; + + used.erase(i); + + return true; + } + + return false; + } + + // return true if the cell becomes empty (i.e. 
it did not contain seq_id before the call) + bool seq_keep(uint32_t i, llama_seq_id seq_id) { + assert(i < pos.size()); + + if (seq[i].test(seq_id)) { + seq_pos_rm(i); + seq[i].reset(); + + seq[i].set(seq_id); + seq_pos[seq_id].insert(pos[i]); + + return false; + } + + if (seq[i].any()) { + seq_pos_rm(i); + seq[i].reset(); + + pos[i] = -1; + + used.erase(i); + + return true; + } + + assert(pos[i] == -1); + + return false; + } + + // number of different sequences in the cell + int seq_count(uint32_t i) const { + assert(i < pos.size()); + assert(pos[i] != -1); + + return seq[i].count(); + } + + // check if the cell contains seq_id + bool seq_has(uint32_t i, llama_seq_id seq_id) const { + assert(i < pos.size()); + assert(seq_id >= 0); + + return seq[i].test(seq_id); + } + + // note: call only if the cell is not empty and the seq_id is not in the cell + void seq_add(uint32_t i, llama_seq_id seq_id) { + assert(i < pos.size()); + assert(pos[i] != -1); + assert(!seq[i].test(seq_id)); + + seq[i].set(seq_id); + seq_pos[seq_id].insert(pos[i]); + } + + // return the sequence id of this cell + // note: call only for cells with exactly one sequence + llama_seq_id seq_get(uint32_t i) const { + assert(seq[i].count() == 1); + + for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) { + if (seq[i].test(s)) { + return s; + } + } + + return -1; + } + + // the minimum position of sequence seq_id currently present in any of the cells + // return -1 if the sequence is not present + llama_pos seq_pos_min(llama_seq_id seq_id) const { + assert(seq_id >= 0); + assert(seq_id < LLAMA_MAX_PARALLEL_SEQUENCES); + + if (seq_pos[seq_id].empty()) { + return -1; + } + + return *seq_pos[seq_id].begin(); + } + + // the maximum position of sequence seq_id currently present in any of the cells + // return -1 if the sequence is not present + llama_pos seq_pos_max(llama_seq_id seq_id) const { + assert(seq_id >= 0); + assert(seq_id < LLAMA_MAX_PARALLEL_SEQUENCES); + + if (seq_pos[seq_id].empty()) { + return -1; + } + + return *seq_pos[seq_id].rbegin(); + } + + // note: call only if the cell is not empty + llama_pos pos_get(uint32_t i) const { + assert(i < pos.size()); + assert(pos[i] != -1); + + return pos[i]; + } + + // note: call only if the cell is not empty + llama_pos get_shift(uint32_t i) const { + assert(i < pos.size()); + assert(pos[i] != -1); + + return shift[i]; + } + + // check if a cell is not empty and its position is within [p0, p1) + bool pos_in(uint32_t i, llama_pos p0, llama_pos p1) const { + assert(i < pos.size()); + + return pos[i] >= p0 && pos[i] < p1; + } + + // set the position of an empty cell + // does not modify "has_shift" + // note: call only if the cell is empty + void pos_set(uint32_t i, llama_pos p) { + assert(i < pos.size()); + assert(pos[i] == -1); + assert(seq[i].none()); + + pos[i] = p; + + used.insert(i); + } + + // pos[i] = pos[i] + d + // sets "has_shift" to true + // note: call only if the cell is not empty + bool pos_add(uint32_t i, llama_pos d) { + assert(i < pos.size()); + assert(pos[i] != -1); + + seq_pos_rm(i); + + pos[i] += d; + shift[i] += d; + + seq_pos_add(i); + + has_shift = true; + + if (pos[i] < 0) { + seq_pos_rm(i); + + seq[i].reset(); + pos[i] = -1; + + used.erase(i); + + return true; + } + + return false; + } + + // pos[i] = pos[i] / d + // sets "has_shift" to true + // note: call only if the cell is not empty + void pos_div(uint32_t i, int d) { + assert(i < pos.size()); + assert(pos[i] != -1); + + const llama_pos p_old = pos[i]; + + seq_pos_rm(i); + + pos[i] /= d; + shift[i] += 
p_old - pos[i];
+
+        seq_pos_add(i);
+
+        has_shift = true;
+    }
+
+private:
+    bool has_shift = false;
+
+    // set of indices of used cells (i.e. pos[i] != -1, allowed to not have any seq_id)
+    std::set<uint32_t> used;
+
+    std::vector<llama_pos> pos;
+
+    // this array accumulates any applied shifts to the pos array since the last reset_shift() call
+    // this is used to queue multiple updates to the pos array, which in the end can be applied in one go:
+    //
+    //   cells.pos_add(x, shift_x);
+    //   cells.pos_div(y, shift_y);
+    //   ...
+    //
+    //   if (cells.has_shift()) {
+    //       for (int i = 0; i < n; ++i) {
+    //           auto shift_i = cells.get_shift(i);
+    //           ...
+    //       }
+    //       cells.reset_shift();
+    //   }
+    //
+    std::vector<llama_pos> shift;
+
+    using bits_t = std::bitset<LLAMA_MAX_PARALLEL_SEQUENCES>;
+
+    // the bitset seq[i] tells us which sequences are currently occupying the i-th cell
+    std::vector<bits_t> seq;
+
+    // the set seq_pos[s] tells us which positions are currently present for sequence s
+    // this way seq_pos[s].begin() and seq_pos[s].rbegin() give us the min/max positions currently in the cache
+    std::set<llama_pos> seq_pos[LLAMA_MAX_PARALLEL_SEQUENCES];
+
+    // helper functions for updating `seq_pos`, one cell at a time:
+
+    // remove cell i
+    void seq_pos_rm(uint32_t i) {
+        for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
+            if (seq[i].test(s)) {
+                seq_pos[s].erase(pos[i]);
+            }
+        }
+    }
+
+    // add cell i
+    void seq_pos_add(uint32_t i) {
+        for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
+            if (seq[i].test(s)) {
+                seq_pos[s].insert(pos[i]);
+            }
+        }
+    }
+};
diff --git a/src/llama-memory.h b/src/llama-memory.h
index c2571edc7..b3799d66e 100644
--- a/src/llama-memory.h
+++ b/src/llama-memory.h
@@ -2,6 +2,11 @@
 
 #include "llama.h"
 
+#include <memory>
+#include <vector>
+
+struct llama_ubatch;
+
 struct llama_memory_params {
     // kv cache
     ggml_type type_k;
@@ -22,7 +27,7 @@ public:
     virtual bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) = 0;
     virtual void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0;
     virtual void seq_keep(llama_seq_id seq_id) = 0;
-    virtual void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) = 0;
+    virtual void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) = 0;
     virtual void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) = 0;
 
     virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
@@ -30,3 +35,42 @@ public:
 
     virtual bool get_can_edit() const = 0;
 };
+
+enum llama_memory_status {
+    LLAMA_MEMORY_STATUS_SUCCESS = 0,
+    LLAMA_MEMORY_STATUS_FAILED_PREPARE,
+    LLAMA_MEMORY_STATUS_FAILED_COMPUTE,
+};
+
+// the interface for managing the memory state during batch processing
+// this interface is implemented per memory type. see:
+//   - llama_kv_cache_unified_state
+//   - llama_kv_cache_unified_iswa_state
+//   ...
+//
+// the only method that can mutate the memory and the memory state is llama_memory_i::apply()
+//
+// TODO: rename to llama_memory_context_i ?
+class llama_memory_state_i {
+public:
+    virtual ~llama_memory_state_i() = default;
+
+    // consume the current ubatch from the state and proceed to the next one
+    // return false if we are done
+    virtual bool next() = 0;
+
+    // apply the memory state for the current ubatch to the memory object
+    // return false on failure
+    virtual bool apply() = 0;
+
+    // TODO: this might get reworked in the future when refactoring llama_batch
+    virtual std::vector<int64_t> & out_ids() = 0;
+
+    // get the current ubatch
+    virtual const llama_ubatch & get_ubatch() const = 0;
+
+    // get the status of the memory state
+    virtual llama_memory_status get_status() const = 0;
+};
+
+using llama_memory_state_ptr = std::unique_ptr<llama_memory_state_i>;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index b0512bf44..e46403f3c 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -5,7 +5,10 @@
 #include "llama-batch.h"
 #include "llama-cparams.h"
 #include "llama-model-loader.h"
-#include "llama-kv-cache.h"
+
+#include "llama-kv-cache-unified.h"
+#include "llama-kv-cache-unified-iswa.h"
+#include "llama-kv-cache-recurrent.h"
 
 #include "ggml-cpp.h"
 
@@ -464,11 +467,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         GGML_ASSERT(hparams.n_expert_used == 0);
     }
 
-    // zero-out the array hparams
     std::fill(hparams.n_head_arr.begin(),    hparams.n_head_arr.end(),    0);
     std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
     std::fill(hparams.n_ff_arr.begin(),      hparams.n_ff_arr.end(),      0);
+    std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
+
+    std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);
+
     ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff_arr,   hparams.n_layer, false);
     ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
 
@@ -575,7 +581,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
                 hparams.n_swa = 8192; // should this be a gguf kv?
currently it's the same for Scout and Maverick - hparams.n_swa_pattern = 4; // pattern: 3 chunked - 1 full + hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full switch (hparams.n_expert) { case 16: type = LLM_TYPE_17B_16E; break; @@ -681,6 +687,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); + ml.get_arr_n(LLM_KV_CLASSIFIER_OUTPUT_LABELS, hparams.n_cls_out, false); switch (hparams.n_layer) { case 3: @@ -876,7 +883,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_NONE; hparams.n_swa = 0; - hparams.n_swa_pattern = 1; + hparams.set_swa_pattern(1); } } break; case LLM_ARCH_PHIMOE: @@ -948,7 +955,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; hparams.n_swa = 4096; // default value of gemma 2 - hparams.n_swa_pattern = 2; + hparams.set_swa_pattern(2); hparams.attn_soft_cap = true; ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); @@ -966,7 +973,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { case LLM_ARCH_GEMMA3: { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; - hparams.n_swa_pattern = 6; + hparams.set_swa_pattern(6); hparams.rope_freq_base_train_swa = 10000.0f; hparams.rope_freq_scale_train_swa = 1.0f; @@ -1051,7 +1058,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { case LLM_ARCH_COHERE2: { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; - hparams.n_swa_pattern = 4; + hparams.set_swa_pattern(4); ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); @@ -2123,7 +2130,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { case LLM_ARCH_NOMIC_BERT_MOE: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); - type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); + type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED); if (arch == LLM_ARCH_BERT) { pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0); @@ -2131,8 +2138,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED); cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED); - cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED); - cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {1}, TENSOR_NOT_REQUIRED); + cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED); + cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED); } tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); @@ -2141,7 +2148,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) { for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; - if (arch == LLM_ARCH_BERT) { + layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED); + layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED); + + if (!layer.wqkv) { layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0); 
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0); @@ -2150,12 +2160,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0); layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0); - } else { - layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0); - } - - if (arch == LLM_ARCH_NOMIC_BERT_MOE) { - layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0); } layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); @@ -2529,7 +2533,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // output output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); - output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; @@ -4363,7 +4371,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str()); LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa); - LLAMA_LOG_INFO("%s: n_swa_pattern = %u\n", __func__, hparams.n_swa_pattern); + LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any()); LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k); LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v); LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str()); @@ -5923,8 +5931,10 @@ struct llm_build_bert : public llm_graph_context { inpL = build_inp_embd(model.tok_embd); // token types are hardcoded to zero ("Sentence A") - ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0); - inpL = ggml_add(ctx0, inpL, type_row0); + if (model.type_embd) { + ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0); + inpL = ggml_add(ctx0, inpL, type_row0); + } if (model.arch == LLM_ARCH_BERT) { inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL); } @@ -5945,36 +5955,11 @@ struct llm_build_bert : public llm_graph_context { ggml_tensor * Vcur; // self-attention - if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) { - Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq); - - if (model.layers[il].attn_q_norm) { - Qcur = build_norm(Qcur, - model.layers[il].attn_q_norm, - model.layers[il].attn_q_norm_b, - LLM_NORM, il); - } - - Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk); - - if (model.layers[il].attn_k_norm) { - Kcur = build_norm(Kcur, - model.layers[il].attn_k_norm, - model.layers[il].attn_k_norm_b, - LLM_NORM, il); - } - - Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - } else { - // compute Q 
and K and RoPE them + if (model.layers[il].wqkv) { cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); - if (model.arch == LLM_ARCH_NOMIC_BERT_MOE || model.arch == LLM_ARCH_JINA_BERT_V3) { + if (model.layers[il].bqkv) { cur = ggml_add(ctx0, cur, model.layers[il].bqkv); cb(cur, "bqkv", il); } @@ -5982,11 +5967,32 @@ struct llm_build_bert : public llm_graph_context { Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + } else { + Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq); + Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk); + Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv); + } - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + if (model.layers[il].attn_q_norm) { + Qcur = build_norm(Qcur, + model.layers[il].attn_q_norm, + model.layers[il].attn_q_norm_b, + LLM_NORM, il); + } + if (model.layers[il].attn_k_norm) { + Kcur = build_norm(Kcur, + model.layers[il].attn_k_norm, + model.layers[il].attn_k_norm_b, + LLM_NORM, il); + } + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + // RoPE + if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) { Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, @@ -8932,9 +8938,9 @@ struct llm_build_mamba : public llm_graph_context { ggml_tensor * state_mask, const llama_ubatch & ubatch, int il) const { - const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory); + const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate); - const auto kv_head = kv_self->head; + const auto kv_head = kv_state->get_head(); const int64_t d_conv = hparams.ssm_d_conv; const int64_t d_inner = hparams.ssm_d_inner; @@ -8952,8 +8958,8 @@ struct llm_build_mamba : public llm_graph_context { GGML_ASSERT(ubatch.equal_seqs); GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); - ggml_tensor * conv_states_all = kv_self->k_l[il]; - ggml_tensor * ssm_states_all = kv_self->v_l[il]; + ggml_tensor * conv_states_all = kv_state->get_k_l(il); + ggml_tensor * ssm_states_all = kv_state->get_v_l(il); // (ab)using the KV cache to store the states ggml_tensor * conv = build_copy_mask_state( @@ -11680,7 +11686,7 @@ struct llm_build_rwkv6_base : public llm_graph_context { ggml_tensor * state_mask, const llama_ubatch & ubatch, int il) const { - const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory); + const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate); const auto n_tokens = ubatch.n_tokens; const auto n_seqs = ubatch.n_seqs; @@ -11690,7 +11696,7 @@ struct llm_build_rwkv6_base : public llm_graph_context { const auto n_head = n_embd / head_size; const auto n_head_kv = hparams.n_head_kv(il); - const auto kv_head = kv_self->head; + const auto kv_head = kv_state->get_head(); const auto & layer = model.layers[il]; @@ -11802,7 +11808,7 @@ struct llm_build_rwkv6_base : public llm_graph_context { } ggml_tensor *
wkv_state = build_copy_mask_state( - gf, kv_self->v_l[il], state_copy, state_mask, + gf, kv_state->get_v_l(il), state_copy, state_mask, hparams.n_embd_v_s(), n_seqs); ggml_tensor * wkv_output; @@ -11821,9 +11827,9 @@ struct llm_build_rwkv6_base : public llm_graph_context { wkv_state, ggml_view_1d( ctx0, - kv_self->v_l[il], + kv_state->get_v_l(il), hparams.n_embd_v_s() * n_seqs, - hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self->v_l[il]) + hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_state->get_v_l(il)) ) ) ); @@ -12076,7 +12082,7 @@ struct llm_build_rwkv7_base : public llm_graph_context { ggml_tensor *& first_layer_value, const llama_ubatch & ubatch, int il) const { - const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory); + const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate); const auto n_tokens = ubatch.n_tokens; const auto n_seqs = ubatch.n_seqs; @@ -12085,7 +12091,7 @@ struct llm_build_rwkv7_base : public llm_graph_context { const auto head_count = n_embd / head_size; const auto n_seq_tokens = ubatch.n_seq_tokens; - const auto kv_head = kv_self->head; + const auto kv_head = kv_state->get_head(); const auto & layer = model.layers[il]; @@ -12156,7 +12162,7 @@ struct llm_build_rwkv7_base : public llm_graph_context { a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens); ggml_tensor * wkv_state = build_copy_mask_state( - gf, kv_self->v_l[il], state_copy, state_mask, + gf, kv_state->get_v_l(il), state_copy, state_mask, hparams.n_embd_v_s(), n_seqs); ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state); @@ -12170,9 +12176,9 @@ struct llm_build_rwkv7_base : public llm_graph_context { wkv_state, ggml_view_1d( ctx0, - kv_self->v_l[il], + kv_state->get_v_l(il), hparams.n_embd_v_s() * n_seqs, - hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self->v_l[il]) + hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_state->get_v_l(il)) ) ) ); @@ -13233,6 +13239,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, case LLM_ARCH_JINA_BERT_V3: case LLM_ARCH_NOMIC_BERT: case LLM_ARCH_NOMIC_BERT_MOE: + case LLM_ARCH_WAVTOKENIZER_DEC: { res = nullptr; } break; @@ -13259,7 +13266,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx); if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { - GGML_ASSERT(hparams.n_swa_pattern != 1); + GGML_ASSERT(hparams.is_swa_any()); res = new llama_kv_cache_unified_iswa( *this, @@ -13270,10 +13277,10 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, params.swa_full, cparams.n_ctx, cparams.n_seq_max, - cparams.n_batch, + cparams.n_ubatch, padding); } else { - GGML_ASSERT(hparams.n_swa_pattern == 1); + GGML_ASSERT(!hparams.is_swa_any()); res = new llama_kv_cache_unified( *this, @@ -13302,7 +13309,6 @@ llm_graph_result_ptr llama_model::build_graph( switch (arch) { case LLM_ARCH_LLAMA: - case LLM_ARCH_MINICPM: { llm = std::make_unique<llm_build_llama>(*this, params, gf); } break; @@ -13544,6 +13550,7 @@ llm_graph_result_ptr llama_model::build_graph( } break; case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE_MOE: + case LLM_ARCH_MINICPM: { llm = std::make_unique<llm_build_granite>(*this, params, gf); } break; @@ -13634,6 +13641,10 @@ int32_t llama_model_n_head_kv(const llama_model * model) { return model->hparams.n_head_kv(); } +int32_t llama_model_n_swa(const llama_model * model) { + return model->hparams.n_swa; +} + // deprecated int32_t llama_n_ctx_train(const
llama_model * model) { return llama_model_n_ctx_train(model); diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 804b11e0a..bfbf5fa23 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -798,7 +798,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_d } // if we have enough values the operation was a success - if (filtered_tokens.size() >= ctx->min_keep) { + if (!filtered_tokens.empty() && filtered_tokens.size() >= ctx->min_keep) { memcpy(cur_p->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data)); cur_p->size = filtered_tokens.size(); min_p_applied = true; @@ -909,7 +909,7 @@ static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token cum_sum += cur_p->data[idx].p; // Check if the running sum is greater than typical or if we have kept at least min_keep tokens - if (cum_sum > ctx->p && i >= ctx->min_keep - 1) { + if (cum_sum > ctx->p && (ctx->min_keep == 0 || i >= ctx->min_keep - 1)) { last_idx = i + 1; break; } diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 6d6a1c6e1..00d26ff85 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -835,7 +835,7 @@ struct llm_tokenizer_ugm_session { } // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores - std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -FLT_MAX}); + std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -DBL_MAX}); // at the beginning tokenization score is zero tokenization_results[0] = { vocab.token_unk(), 0, 0 }; @@ -867,7 +867,7 @@ struct llm_tokenizer_ugm_session { const double challenger_score = current_best.score_sum + token_score; struct best_tokenization & current_champ = tokenization_results[prefix_offset]; if (challenger_score > current_champ.score_sum) { - struct best_tokenization challenger = { token_id, input_offset, (float) challenger_score }; + struct best_tokenization challenger = { token_id, input_offset, challenger_score }; current_champ = challenger; } } @@ -881,7 +881,7 @@ struct llm_tokenizer_ugm_session { prefix_offset = input_offset + n_utf8_code_units; struct best_tokenization & current_champ = tokenization_results[prefix_offset]; if (challenger_score > current_champ.score_sum) { - struct best_tokenization challenger = { vocab.token_unk(), input_offset, (float) challenger_score }; + struct best_tokenization challenger = { vocab.token_unk(), input_offset, challenger_score }; current_champ = challenger; } } @@ -1007,7 +1007,7 @@ private: struct best_tokenization { llama_token token_id; size_t input_offset; - float score_sum; + double score_sum; }; struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) { @@ -2096,7 +2096,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { // set attributes by model/tokenizer/architecture name if (false || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"}) - || _contains_any(general_arch, {"jina-bert-v3"}) + || _contains_any(general_arch, {"nomic-bert-moe", "jina-bert-v3"}) ) { _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true); } else if (_contains_any(model_name, {"phi-3", "phi3"})) { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 083347d18..83f7d1a45 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -97,6 +97,9 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS
${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf) llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf) +# TODO: missing HF tokenizer for this model in convert_hf_to_gguf_update.py, see https://github.com/ggml-org/llama.cpp/pull/13847 +# llama_test(test-tokenizer-0 NAME test-tokenizer-0-nomic-bert-moe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-nomic-bert-moe.gguf) + if (LLAMA_LLGUIDANCE) llama_build_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf) endif () @@ -142,8 +145,10 @@ if (NOT WIN32) # llama_build_and_test(test-double-float.cpp) # SLOW endif() -llama_build_and_test(test-log.cpp) +llama_build_and_test(test-chat-parser.cpp) llama_build_and_test(test-chat-template.cpp) +llama_build_and_test(test-json-partial.cpp) +llama_build_and_test(test-log.cpp) llama_build_and_test(test-regex-partial.cpp) # this fails on windows (github hosted runner) due to curl DLL not found (exit code 0xc0000135) diff --git a/tests/test-chat-parser.cpp b/tests/test-chat-parser.cpp new file mode 100644 index 000000000..59e44e07d --- /dev/null +++ b/tests/test-chat-parser.cpp @@ -0,0 +1,352 @@ +// Tests chat handling, including grammar generation and parsing for tool calling, for various templates. +// +// Also acts as a CLI to generate a Markdown summary of the formats of Jinja templates, +// e.g. given Minja (http://github.com/google/minja) checked out in parent dir: +// +// cmake -B build && cmake --build build --parallel && ./build/bin/test-chat ../minja/build/tests/*.jinja 2>/dev/null +// +#include <iostream> +#include <regex> +#include <string> + +#include "chat-parser.h" +#include "common.h" +#include "log.h" +#include "regex-partial.h" + +template <class T> +static void assert_equals(const T & expected, const T & actual) { + if (expected != actual) { + std::cerr << "Expected: " << expected << std::endl; + std::cerr << "Actual: " << actual << std::endl; + std::cerr << std::flush; + throw std::runtime_error("Test failed"); + } +} +static void assert_equals(const char * expected, const std::string & actual) { + return assert_equals<std::string>(expected, actual); +} + +static void assert_throws(const std::function<void()> & fn, const std::string & expected_exception_pattern = "") { + try { + fn(); + } catch (const std::exception & e) { + if (expected_exception_pattern.empty()) { + return; + } + std::regex expected_exception_regex(expected_exception_pattern); + std::string actual_message = e.what(); + if (std::regex_search(actual_message, expected_exception_regex)) { + return; + } + throw std::runtime_error("Exception doesn't match expected pattern: " + actual_message + " (pattern: " + expected_exception_pattern + ")"); + throw std::runtime_error("Exception of unexpected type: " + std::string(e.what())); + } + throw std::runtime_error("Exception was expected but not thrown"); +} + +static void test_reasoning() { + { + common_chat_msg_parser builder("CogitoErgo sum", /* is_partial= */ false, { + /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ false, + }); + assert_equals(false, builder.try_parse_reasoning("", "")); + assert_equals("CogitoErgo sum", builder.consume_rest()); + } + { + common_chat_msg_parser builder("CogitoErgo sum", /* is_partial= */ false, { + /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /*
.reasoning_in_content = */ false, + /* .thinking_forced_open = */ false, + }); + assert_equals(true, builder.try_parse_reasoning("", "")); + assert_equals(std::string("Cogito"), builder.result().reasoning_content); + assert_equals("Ergo sum", builder.consume_rest()); + } + { + common_chat_msg_parser builder("CogitoErgo sum", /* is_partial= */ false, { + /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ false, + }); + assert_equals(false, builder.try_parse_reasoning("", "")); + assert_equals("CogitoErgo sum", builder.consume_rest()); + } + { + common_chat_msg_parser builder("CogitoErgo sum", /* is_partial= */ false, { + /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ true, + }); + assert_equals(true, builder.try_parse_reasoning("", "")); + assert_equals(std::string("Cogito"), builder.result().reasoning_content); + assert_equals("Ergo sum", builder.consume_rest()); + } + { + common_chat_msg_parser builder("CogitoErgo sum", /* is_partial= */ false, { + /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ true, + /* .thinking_forced_open = */ true, + }); + assert_equals(true, builder.try_parse_reasoning("", "")); + assert_equals("Cogito", builder.result().content); + assert_equals("Ergo sum", builder.consume_rest()); + } +} + +static void test_regex() { + auto test_throws = [](const std::string & input, const std::string & regex, const std::string & expected_exception_pattern = "") { + common_chat_msg_parser builder(input, /* is_partial= */ false, {}); + assert_throws([&]() { builder.consume_regex(common_regex(regex)); }, expected_exception_pattern); + }; + + test_throws("Hello, world!", "abc", "^abc$"); + test_throws("Hello, world!", "e", "^e$"); + + { + common_chat_msg_parser builder("Hello, world!", /* is_partial= */ false, {}); + builder.consume_regex(common_regex("Hello")); + assert_equals(", world!", builder.consume_rest()); + } + + { + // When in non partial mode, we can say whether the regex was consumed or not. + common_chat_msg_parser builder("Hello,", /* is_partial= */ false, {}); + assert_equals(false, builder.try_consume_regex(common_regex("Hello, world!")).has_value()); + } + { + common_chat_msg_parser builder("Hello,", /* is_partial= */ false, {}); + auto res = builder.try_consume_regex(common_regex("H(el)l(?:o, world!)?")); + assert_equals(true, res.has_value()); + // Verify captures + assert_equals(2, res->groups.size()); + assert_equals("Hell", builder.str(res->groups[0])); + assert_equals("el", builder.str(res->groups[1])); + // Verify position is after the match + assert_equals(4, builder.pos()); + assert_equals("o,", builder.consume_rest()); + } + { + // But in partial mode, we have a partial final match / can't decide, so we throw a partial exception. + common_chat_msg_parser builder("Hello,", /* is_partial= */ true, {}); + assert_throws([&]() { + builder.try_consume_regex(common_regex("Hello, world!")); + }, "^Hello, world!$"); + } + + // Now regardless of the mode, we can tell these aren't a match. 
+ for (const auto is_partial : {false, true}) { + common_chat_msg_parser builder("Hello,", is_partial, {}); + assert_equals(false, builder.try_consume_regex(common_regex("a(b|c)(d|e)f")).has_value()); + } + for (const auto is_partial : {false, true}) { + common_chat_msg_parser builder("Hello,", is_partial, {}); + assert_equals(false, builder.try_consume_literal("Oh")); + } +} + +const std::vector<std::string> barely_healable_jsons = { + "{", + "{\"", + "{\"\\", + "{\"n", + "{\"name\"", + "{\"name\":", + "{\"name\":\"", + "{\"name\":\"\\", + "{\"name\":\"python", + "{\"name\":\"python\\", + "{\",", + "{\":", + "{\"[", + "{\"]", + "{\"{", + "{\"}", + "{\"1", + "{\"name\":\",", + "{\"name\":\":", + "{\"name\":\"[", + "{\"name\":\"]", + "{\"name\":\"{", + "{\"name\":\"}", + "{\"name\":\"1", +}; + +static void test(const std::string & input, bool is_partial, const std::vector<std::vector<std::string>> & args_paths, const std::vector<std::vector<std::string>> & content_paths, const std::string & expected) { + common_chat_msg_parser builder(input, is_partial, {}); + auto js = builder.try_consume_json_with_dumped_args(args_paths, content_paths); + assert_equals(true, js.has_value()); + assert_equals(is_partial, js->is_partial); + assert_equals(expected, args_paths.size() == 1 && args_paths[0].empty() ? js->value.get<std::string>() : js->value.dump()); +} +static void test_with_args(const std::string & input, const std::string & expected, bool parse_as_partial = true, bool is_partial = true) { + common_chat_msg_parser builder(input, parse_as_partial, {}); + auto js = builder.try_consume_json_with_dumped_args({{"args"}}, {}); + assert_equals(true, js.has_value()); + assert_equals(is_partial, js->is_partial); + assert_equals(expected, js->value.dump()); +} + +static void test_json_with_dumped_args_no_args() { + // Normal JSON, nothing to heal, nothing to dump + test("{\"name\": \"python\"}", false, {}, {}, "{\"name\":\"python\"}"); + // Full json is args + test("{\"name\": \"python\"}", false, {{}}, {}, "{\"name\":\"python\"}"); + + // If the arguments are further down, don't heal partial content. + for (const auto & src : barely_healable_jsons) { + test(src, true, {{"arguments"}}, {}, "{}"); + } + // But heal content that isn't partial. + test("{\"name\": \"python\"", true, {{"arguments"}}, {}, "{\"name\":\"python\"}"); +} + +static void test_json_with_dumped_args() { + + // Partial content. + test("{\"content\": \"t", true, {}, {{"content"}}, "{\"content\":\"t\"}"); + test("{\"content\": \"", true, {}, {{"content"}}, "{\"content\":\"\"}"); + test("{\"content\": ", true, {}, {{"content"}}, "{}"); + + // If the entire JSON is the arguments, healing it then dumping it produces the same output as the input (just reformatted).
+ test("{\"name\": \"python", true, {{}}, {}, "{\"name\":\"python"); + for (const auto & src : barely_healable_jsons) { + test(src, true, {{}}, {}, src); + } + + // Full JSON w/ args + for (auto parse_as_partial : {true, false}) { + test_with_args( + R"({"name": "python", "args": {"arg1": 1}})", + R"({"name":"python","args":"{\"arg1\":1}"})", + parse_as_partial, + /* is_partial= */ false + ); + } + + // Partial JSON w/ partial args + test_with_args( + R"({"foo": "bar", "args": {")", + R"({"foo":"bar","args":"{\""})" + ); + // Partial args broken in object key + test_with_args( + R"({"foo": "bar", "args": {"ar)", + R"({"foo":"bar","args":"{\"ar"})" + ); + // Partial args broken after object key + test_with_args( + R"({"foo": "bar", "args": {"arg1")", + R"({"foo":"bar","args":"{\"arg1\""})" + ); + // Partial args broken before object value + test_with_args( + R"({"foo": "bar", "args": {"arg1":)", + R"({"foo":"bar","args":"{\"arg1\":"})" + ); + // Partial args broken before object value (space) + test_with_args( + R"({"foo": "bar", "args": {"arg1": )", + R"({"foo":"bar","args":"{\"arg1\":"})" + ); + // Partial args broken in object value that may not be complete (int) + test_with_args( + R"({"foo": "bar", "args": {"arg1": 1)", + R"({"foo":"bar","args":"{\"arg1\":"})" + ); + // Partial args broken in object value that is complete (int) + test_with_args( + R"({"foo": "bar", "args": {"arg1": 1 )", + R"({"foo":"bar","args":"{\"arg1\":1"})" + ); + // Partial args broken in object value that is incomplete (string) + test_with_args( + R"({"foo": "bar", "args": {"arg1": ")", + R"({"foo":"bar","args":"{\"arg1\":\""})" + ); + // Partial args broken in object value that is complete (string) + test_with_args( + R"({"foo": "bar", "args": {"arg1": "1")", + R"({"foo":"bar","args":"{\"arg1\":\"1\""})" + ); + // Partial args broken on array opening + test_with_args( + R"({"foo": "bar", "args": [)", + R"({"foo":"bar","args":"["})" + ); + // Partial args broken on array value that is incomplete (int) + test_with_args( + R"({"foo": "bar", "args": [1)", + R"({"foo":"bar","args":"["})" + ); + // Partial args broken on array value that is complete (int) + test_with_args( + R"({"foo": "bar", "args": [1 )", + R"({"foo":"bar","args":"[1"})" + ); + // Partial args broken on array value that is complete (string) + test_with_args( + R"({"foo": "bar", "args": ["1")", + R"({"foo":"bar","args":"[\"1\""})" + ); + // Partial args broken after array value + test_with_args( + R"({"foo": "bar", "args": [1,)", + R"({"foo":"bar","args":"[1,"})" + ); + // Partial args broken on nested array + test_with_args( + R"({"foo": "bar", "args": {"arg1": [)", + R"({"foo":"bar","args":"{\"arg1\":["})" + ); +} + +static void test_positions() { + { + common_chat_msg_parser builder("Hello, world!", /* is_partial= */ false, {}); + assert_equals(0, builder.pos()); + assert_throws([&]() { builder.move_to(100); }); + assert_equals(0, builder.pos()); + assert_throws([&]() { builder.move_back(1); }); + assert_equals(0, builder.pos()); + + builder.move_to(8); + assert_equals(8, builder.pos()); + builder.move_back(1); + assert_equals(7, builder.pos()); + assert_equals("world!", builder.consume_rest()); + + builder.move_to(0); + assert_equals(0, builder.pos()); + + assert_throws([&]() { builder.finish(); }); + assert_equals(0, builder.pos()); + + builder.move_to(builder.input().size()); + builder.finish(); + } + { + common_chat_msg_parser builder("Hello, world!", /* is_partial= */ true, {}); + + builder.move_to(builder.input().size()); + 
assert_equals(builder.input().size(), builder.pos()); + builder.finish(); + } +} + +int main() { + test_positions(); + test_json_with_dumped_args_no_args(); + test_json_with_dumped_args(); + test_reasoning(); + test_regex(); + std::cout << "All tests passed!\n"; + return 0; +} diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index 4d70da8c3..1c9807921 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -5,21 +5,80 @@ // // cmake -B build && cmake --build build --parallel && ./build/bin/test-chat ../minja/build/tests/*.jinja 2>/dev/null // -#include -#include -#include -#include - #include "chat.h" #include "../src/unicode.h" #include "../src/llama-grammar.h" +#include + +#include +#include +#include + using json = nlohmann::ordered_json; +static std::ostream & operator<<(std::ostream & os, const common_chat_msg_diff & diff) { + // os << "reasoning_content_delta: " << diff.reasoning_content_delta << '\n'; + os << "{ content_delta: " << diff.content_delta << "; "; + if (diff.tool_call_index != std::string::npos) { + os << "tool_call_index: " << diff.tool_call_index << "; "; + os << "tool_call_delta.name: " << diff.tool_call_delta.name << "; "; + os << "tool_call_delta.id: " << diff.tool_call_delta.id << "; "; + os << "tool_call_delta.arguments: " << diff.tool_call_delta.arguments << "; "; + } + os << "}"; + return os; +} +// operator<< for vector: +static std::ostream & operator<<(std::ostream & os, const std::vector & diffs) { + os << "[\n"; + for (const auto & diff : diffs) { + os << " " << diff << ",\n"; + } + os << "]"; + return os; +} +static std::ostream & operator<<(std::ostream & os, const common_chat_msg & msg) { + os << "{ role: " << msg.role << "; "; + os << "content: " << msg.content << "; "; + os << "content_parts: [\n"; + for (const auto & part : msg.content_parts) { + os << " { type: " << part.type << "; text: " << part.text << " },\n"; + } + os << "]; "; + os << "reasoning_content: " << msg.reasoning_content << "; "; + os << "tool_calls: [\n"; + for (const auto & tool_call : msg.tool_calls) { + os << " { name: " << tool_call.name << "; arguments: " << tool_call.arguments << "; id: " << tool_call.id << " },\n"; + } + os << "]"; + os << "}"; + return os; +} + +template static bool equals(const T & expected, const T & actual) { + return expected == actual; +} + +static common_chat_msg normalize(const common_chat_msg & msg) { + common_chat_msg normalized = msg; + for (auto & tool_call : normalized.tool_calls) { + try { + tool_call.arguments = json::parse(tool_call.arguments).dump(); + } catch (const std::exception &) { + // Do nothing + } + } + return normalized; +} +template <> +bool equals(const common_chat_msg & expected, const common_chat_msg & actual) { + return normalize(expected) == normalize(actual); +} template static void assert_equals(const T & expected, const T & actual) { - if (expected != actual) { + if (!equals(expected, actual)) { std::cerr << "Expected: " << expected << std::endl; std::cerr << "Actual: " << actual << std::endl; std::cerr << std::flush; @@ -77,6 +136,15 @@ static bool match_string(const std::string & input, llama_grammar * grammar) { return false; } +static std::string renormalize_json(const std::string & json_str) { + try { + auto json_obj = json::parse(json_str); + return json_obj.dump(); + } catch (const std::exception & e) { + std::cerr << "Failed to parse JSON: " << e.what() << '\n'; + return json_str; + } +} static void assert_msg_equals(const common_chat_msg & expected, const common_chat_msg & actual) { 
assert_equals(expected.role, actual.role); assert_equals(expected.content, actual.content); @@ -93,7 +161,7 @@ static void assert_msg_equals(const common_chat_msg & expected, const common_cha const auto & expected_tool_call = expected.tool_calls[i]; const auto & actual_tool_call = actual.tool_calls[i]; assert_equals(expected_tool_call.name, actual_tool_call.name); - assert_equals(json::parse(expected_tool_call.arguments).dump(), json::parse(actual_tool_call.arguments).dump()); + assert_equals(renormalize_json(expected_tool_call.arguments), renormalize_json(actual_tool_call.arguments)); assert_equals(expected_tool_call.id, actual_tool_call.id); } } @@ -152,14 +220,12 @@ static delta_data init_delta(const struct common_chat_templates * tmpls, const s const common_chat_msg & user_message, const common_chat_msg & delta_message, const std::vector & tools, - const common_chat_tool_choice & tool_choice, - bool think = false) { + const common_chat_tool_choice & tool_choice) { common_chat_templates_inputs inputs; inputs.parallel_tool_calls = true; inputs.messages.push_back(user_message); inputs.tools = tools; inputs.tool_choice = tool_choice; - inputs.extract_reasoning = think; auto params_prefix = common_chat_templates_apply(tmpls, inputs); inputs.messages.push_back(delta_message); @@ -211,19 +277,22 @@ static void test_templates(const struct common_chat_templates * tmpls, const std const std::string & expected_delta = "", bool expect_grammar_triggered = true, bool test_grammar_if_triggered = true, - bool think = false) { + common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE) { common_chat_msg user_message; user_message.role = "user"; user_message.content = "Hello, world!"; for (const auto & tool_choice : std::vector {COMMON_CHAT_TOOL_CHOICE_AUTO, COMMON_CHAT_TOOL_CHOICE_REQUIRED}) { - auto data = init_delta(tmpls, end_tokens, user_message, test_message, tools, tool_choice, think); + auto data = init_delta(tmpls, end_tokens, user_message, test_message, tools, tool_choice); if (!expected_delta.empty()) { assert_equals(expected_delta, data.delta); } if (expect_grammar_triggered) { - const auto msg = common_chat_parse(data.delta, data.params.format); + common_chat_syntax syntax; + syntax.format = data.params.format; + syntax.reasoning_format = reasoning_format; + const auto msg = common_chat_parse(data.delta, /* is_partial= */ false, syntax); assert_msg_equals(test_message, msg); } @@ -251,15 +320,25 @@ static void test_templates(const struct common_chat_templates * tmpls, const std { const auto & pattern = trigger.value; if (std::regex_search(constrained, match, std::regex(pattern))) { - pos = match.position(); + pos = match.position(1); } break; } - case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START: + case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL: { const auto & pattern = trigger.value; - if (std::regex_search(constrained, match, std::regex(pattern)) && match.position() == 0) { - pos = 0; + if (std::regex_match(constrained, match, std::regex(pattern))) { + auto mpos = std::string::npos; + for (size_t i = 1; i < match.size(); ++i) { + if (match[i].length() > 0) { + mpos = match.position(i); + break; + } + } + if (mpos == std::string::npos) { + mpos = match.position(0); + } + pos = mpos; } break; } @@ -313,117 +392,42 @@ const common_chat_msg message_user_parts { /* .tool_name = */ "", /* .tool_call_id = */ "", }; -const common_chat_msg message_assist { - "assistant", - "Hello, world!\nWhat's up?", - /* .content_parts = */ {}, - /* .tool_calls = */ {}, - /* .reasoning_content = */ 
"", - /* .tool_name = */ "", - /* .tool_call_id = */ "", -}; -const common_chat_msg message_assist_thoughts_unparsed_think { - "assistant", - "I'm thinkingHello, world!\nWhat's up?", - /* .content_parts = */ {}, - /* .tool_calls = */ {}, - /* .reasoning_content = */ "", - /* .tool_name = */ "", - /* .tool_call_id = */ "", -}; -const common_chat_msg message_assist_thoughts_unparsed_r7b { - "assistant", - "<|START_THINKING|>I'm thinking<|END_THINKING|>Hello, world!\nWhat's up?", - /* .content_parts = */ {}, - /* .tool_calls = */ {}, - /* .reasoning_content = */ "", - /* .tool_name = */ "", - /* .tool_call_id = */ "", -}; -const common_chat_msg message_assist_thoughts { - "assistant", - "Hello, world!\nWhat's up?", - /* .content_parts = */ {}, - /* .tool_calls = */ {}, - /* .reasoning_content = */ "I'm thinking", - /* .tool_name = */ "", - /* .tool_call_id = */ "", -}; -const std::vector tool_calls { - { "special_function", "{\"arg1\": 1}", /* .id = */ "" }, -}; -const std::vector tool_calls_idx { - { "special_function", "{\"arg1\": 1}", /* .id = */ "0" }, -}; -const std::vector tool_calls_id { - { "special_function", "{\"arg1\": 1}", /* .id = */ "123456789" }, -}; +static common_chat_msg simple_assist_msg(const std::string & content, const std::string & reasoning_content = "", const std::string & tool_name = "", const std::string & arguments = "", const std::string & id = "") { + common_chat_msg msg; + msg.role = "assistant"; + msg.content = content; + msg.reasoning_content = reasoning_content; + if (!tool_name.empty()) { + msg.tool_calls.push_back({ tool_name, arguments, id }); + } + return msg; +} +const common_chat_msg message_assist = simple_assist_msg("Hello, world!\nWhat's up?"); +const common_chat_msg message_assist_empty = simple_assist_msg(""); +const common_chat_msg message_assist_thoughts_unparsed_deepseek = simple_assist_msg("I'm\nthinkingHello, world!\nWhat's up?"); +const common_chat_msg message_assist_thoughts_unparsed_md = simple_assist_msg("I'm\nthinkingHello, world!\nWhat's up?\n```json\n{}```"); +const common_chat_msg message_assist_thoughts_unparsed_md_partial = simple_assist_msg("I'm\nthinkingHello, world!\nWhat's up?\n```json\n{}"); -const common_chat_msg message_assist_call { - "assistant", - "", - /* .content_parts = */ {}, - tool_calls, - /* .reasoning_content = */ "", - /* .tool_name = */ "", - /* .tool_call_id = */ "", -}; -const common_chat_msg message_assist_call_thoughts = { - "assistant", - /* .content = */ "", - /* .content_parts = */ {}, - tool_calls, - /* .reasoning_content = */ "I'm\nthinking", - /* .tool_name = */ "", - /* .tool_call_id = */ "", -}; -const common_chat_msg message_assist_call_thoughts_unparsed = { - "assistant", - /* .content = */ "I'm\nthinking", - /* .content_parts = */ {}, - tool_calls, - /* .reasoning_content = */ "", - /* .tool_name = */ "", - /* .tool_call_id = */ "", -}; -const common_chat_msg message_assist_call_id { - "assistant", - "", - /* .content_parts = */ {}, - tool_calls_id, - /* .reasoning_content = */ "", - /* .tool_name = */ "", - /* .tool_call_id = */ "", -}; -const common_chat_msg message_assist_call_idx { - "assistant", - "", - /* .content_parts = */ {}, - tool_calls_idx, - /* .reasoning_content = */ "", - /* .tool_name = */ "", - /* .tool_call_id = */ "", -}; -const common_chat_msg message_assist_call_python { - "assistant", - "", - /* .content_parts = */ {}, - { { "python", "{\"code\": \"print('hey')\"}", /* .id = */ "" } }, - /* .reasoning_content = */ "", - /* .tool_name = */ "", - /* .tool_call_id = */ "", -}; 
-const common_chat_msg message_assist_call_code_interpreter { - "assistant", - "", - /* .content_parts = */ {}, - { { "code_interpreter", "{\"code\": \"print('hey')\"}", /* .id = */ "" } }, - /* .reasoning_content = */ "", - /* .tool_name = */ "", - /* .tool_call_id = */ "", -}; +const common_chat_msg message_assist_thoughts_unparsed_r7b = simple_assist_msg("<|START_THINKING|>I'm\nthinking<|END_THINKING|>Hello, world!\nWhat's up?"); +const common_chat_msg message_assist_thoughts = simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking"); +const common_chat_msg message_assist_thoughts_unopened_unparsed = simple_assist_msg("I'm\nthinkingHello, world!\nWhat's up?"); +const common_chat_msg message_assist_thoughts_no_content = simple_assist_msg("", "I'm\nthinking"); +const common_chat_msg message_assist_call = simple_assist_msg("", "", "special_function", "{\"arg1\": 1}"); +const common_chat_msg message_assist_call_content = simple_assist_msg("Hello, world!\nWhat's up?", "", "special_function", "{\"arg1\":1}"); +const common_chat_msg message_assist_call_empty_args = simple_assist_msg("", "", "special_function"); +const common_chat_msg message_assist_call_cutoff_args = simple_assist_msg("", "", "special_function", "{\"arg"); +const common_chat_msg message_assist_call_thoughts = simple_assist_msg("", "I'm\nthinking", "special_function", "{\"arg1\":1}"); +const common_chat_msg message_assist_call_thoughts_unparsed = simple_assist_msg("I'm\nthinking\n\n", "", "special_function", "{\"arg1\": 1}"); +const common_chat_msg message_assist_call_id = simple_assist_msg("", "", "special_function", "{\"arg1\":1}", /* .id = */ "123456789"); +const common_chat_msg message_assist_call_idx = simple_assist_msg("", "", "special_function", "{\"arg1\":1}", /* .id = */ "0"); +const common_chat_msg message_assist_thoughts_call_idx = simple_assist_msg("", "I'm\nthinking", "special_function", "{\"arg1\": 1}", /* id = */ "0"); +const common_chat_msg message_assist_call_python = simple_assist_msg("", "", "python", "{\"code\":\"print('hey')\"}"); +const common_chat_msg message_assist_call_python_lines = simple_assist_msg("", "", "python", "{\"code\":\"# This is a program:\\nprint('hey')\"}"); +const common_chat_msg message_assist_call_python_lines_unclosed = simple_assist_msg("", "", "python", "{\"code\":\"# This is a program:\\nprint('hey')"); +const common_chat_msg message_assist_call_code_interpreter = simple_assist_msg("", "", "code_interpreter", "{\"code\":\"print('hey')\"}"); static void test_msgs_oaicompat_json_conversion() { + printf("[%s]\n", __func__); std::vector msgs{ message_user, message_user_parts, @@ -473,7 +477,7 @@ static void test_msgs_oaicompat_json_conversion() { " \"type\": \"function\",\n" " \"function\": {\n" " \"name\": \"python\",\n" - " \"arguments\": \"{\\\"code\\\": \\\"print('hey')\\\"}\"\n" + " \"arguments\": \"{\\\"code\\\":\\\"print('hey')\\\"}\"\n" " }\n" " }\n" " ]\n" @@ -499,6 +503,7 @@ static void test_msgs_oaicompat_json_conversion() { } static void test_tools_oaicompat_json_conversion() { + printf("[%s]\n", __func__); std::vector tools{ special_function_tool, python_tool, @@ -543,29 +548,18 @@ static void test_tools_oaicompat_json_conversion() { } static void test_template_output_parsers() { + printf("[%s]\n", __func__); common_chat_templates_inputs inputs_no_tools; inputs_no_tools.messages = {message_user}; - inputs_no_tools.extract_reasoning = false; - - common_chat_templates_inputs inputs_no_tools_think; - inputs_no_tools_think.messages = {message_user}; - 
inputs_no_tools_think.extract_reasoning = true; common_chat_templates_inputs inputs_tools; inputs_tools.messages = {message_user}; inputs_tools.tools = {special_function_tool}; - inputs_tools.extract_reasoning = false; - - common_chat_templates_inputs inputs_tools_think; - inputs_tools_think.messages = {message_user}; - inputs_tools_think.tools = {special_function_tool}; - inputs_tools_think.extract_reasoning = true; common_chat_templates_inputs inputs_tools_builtin; inputs_tools_builtin.messages = {message_user}; inputs_tools_builtin.tools = {python_tool}; - inputs_tools_builtin.extract_reasoning = false; { // Not supported yet @@ -577,44 +571,87 @@ static void test_template_output_parsers() { auto tmpls = read_templates("models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja"); std::vector end_tokens{ "<|END_OF_TURN_TOKEN|>" }; - assert_equals(COMMON_CHAT_FORMAT_COMMAND_R7B, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); - assert_equals(COMMON_CHAT_FORMAT_COMMAND_R7B, common_chat_templates_apply(tmpls.get(), inputs_tools).format); - assert_equals(COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING, common_chat_templates_apply(tmpls.get(), inputs_tools_think).format); + for (const auto & inputs : { inputs_no_tools, inputs_tools }) { + auto params = common_chat_templates_apply(tmpls.get(), inputs); + assert_equals(COMMON_CHAT_FORMAT_COMMAND_R7B, params.format); + assert_equals(false, params.thinking_forced_open); + } assert_msg_equals(message_assist, common_chat_parse( "Hello, world!\nWhat's up?", - COMMON_CHAT_FORMAT_COMMAND_R7B)); - assert_msg_equals(message_assist, - common_chat_parse( - "Hello, world!\nWhat's up?<|END_RESPONSE|>", - COMMON_CHAT_FORMAT_COMMAND_R7B)); + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_COMMAND_R7B})); assert_msg_equals(message_assist, common_chat_parse( "<|START_RESPONSE|>Hello, world!\nWhat's up?<|END_RESPONSE|>", - COMMON_CHAT_FORMAT_COMMAND_R7B)); - assert_msg_equals(message_assist_thoughts_unparsed_r7b, - common_chat_parse( - "<|START_THINKING|>I'm thinking<|END_THINKING|>" - "<|START_RESPONSE|>Hello, world!\nWhat's up?<|END_RESPONSE|>", - COMMON_CHAT_FORMAT_COMMAND_R7B)); - assert_msg_equals(message_assist_thoughts_unparsed_r7b, - common_chat_parse( - "<|START_THINKING|>I'm thinking<|END_THINKING|>" - "Hello, world!\nWhat's up?<|END_RESPONSE|>", - COMMON_CHAT_FORMAT_COMMAND_R7B)); - + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_COMMAND_R7B})); assert_msg_equals(message_assist_thoughts, common_chat_parse( - "<|START_THINKING|>I'm thinking<|END_THINKING|>" + "<|START_THINKING|>I'm\nthinking<|END_THINKING|>" "<|START_RESPONSE|>Hello, world!\nWhat's up?<|END_RESPONSE|>", - COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING)); + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_COMMAND_R7B, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + assert_msg_equals(message_assist_thoughts_unparsed_deepseek, + common_chat_parse( + "<|START_THINKING|>I'm\nthinking<|END_THINKING|>" + "<|START_RESPONSE|>Hello, world!\nWhat's up?<|END_RESPONSE|>", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_COMMAND_R7B, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ true, + /* .thinking_forced_open = */ false, + })); + assert_msg_equals(message_assist_thoughts_unparsed_r7b, + common_chat_parse( + "<|START_THINKING|>I'm\nthinking<|END_THINKING|>" + "<|START_RESPONSE|>Hello, world!\nWhat's up?<|END_RESPONSE|>", + /* is_partial= */ false, + 
{COMMON_CHAT_FORMAT_COMMAND_R7B})); + assert_msg_equals(message_assist_thoughts, + common_chat_parse( + "<|START_THINKING|>I'm\nthinking<|END_THINKING|>" + "<|START_RESPONSE|>Hello, world!\nWhat's up?<|END_RESPONSE|>", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_COMMAND_R7B, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + assert_msg_equals(message_assist_thoughts_call_idx, + common_chat_parse( + "<|START_THINKING|>I'm\nthinking<|END_THINKING|>" + "<|START_ACTION|>[\n" + " {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", \"parameters\": {\"arg1\": 1}}\n" + "]<|END_ACTION|>", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_COMMAND_R7B, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + assert_msg_equals(message_assist_thoughts_no_content, + common_chat_parse( + "<|START_THINKING|>I'm\nthinking<|END_THINKING|>" + "<|START_ACTION|>[\n" + " {\"tool_call_id\": \"0\", \"tool_name\": \"special", + /* is_partial= */ true, + { + /* .format = */ COMMON_CHAT_FORMAT_COMMAND_R7B, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); test_templates(tmpls.get(), end_tokens, message_assist_call_idx, tools, "<|START_THINKING|><|END_THINKING|>" "<|START_ACTION|>[\n" " {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", \"parameters\": {\"arg1\": 1}}\n" - "]<|END_ACTION|>"); + "]<|END_ACTION|>", + /* expect_grammar_triggered= */ true, + /* test_grammar_if_triggered= */ true, + COMMON_REASONING_FORMAT_DEEPSEEK); test_templates(tmpls.get(), end_tokens, message_assist, tools, "<|START_RESPONSE|>Hello, world!\n" "What's up?<|END_RESPONSE|>", @@ -634,11 +671,52 @@ static void test_template_output_parsers() { // Generic tool calls doesn't generate / parse content-only messages symmetrically. 
+ assert_equals( + simple_assist_msg("{ \"tool_call\" : { \"name\" : \"t"), + common_chat_parse( + "{ \"tool_call\" : { \"name\" : \"t", + /* is_partial= */ true, + { + /* .format = */ COMMON_CHAT_FORMAT_GENERIC, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ true, + /* .parse_tool_calls = */ false, + })); + assert_equals( + message_assist_empty, + common_chat_parse( + "{ \"tool_call\" : { \"name\" : \"t", + /* is_partial= */ true, + {COMMON_CHAT_FORMAT_GENERIC})); + + assert_equals( + simple_assist_msg("", "", "puppeteer_screenshot", "{\"name\":\"servethehome_homepage\","), + common_chat_parse( + R"({"tool_call": {"name": "puppeteer_screenshot", "arguments": {"name": "servethehome_homepage",)", + /* is_partial= */ true, + {COMMON_CHAT_FORMAT_GENERIC})); + + assert_equals( + message_assist_call_empty_args, + common_chat_parse( + "{ \"tool_call\" : { \"name\" : \"special_function\"", + /* is_partial= */ true, + {COMMON_CHAT_FORMAT_GENERIC})); + assert_equals( + message_assist_call_cutoff_args, + common_chat_parse( + "{ \"tool_call\" : { \"name\" : \"special_function\", \"arguments\" : { \"arg", + /* is_partial= */ true, + {COMMON_CHAT_FORMAT_GENERIC})); + assert_msg_equals(message_assist, - common_chat_parse("{\n" - " \"response\": \"Hello, world!\\nWhat's up?\"\n" - "}", - common_chat_templates_apply(tmpls.get(), inputs_tools).format)); + common_chat_parse( + "{\n" + " \"response\": \"Hello, world!\\nWhat's up?\"\n" + "}", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_GENERIC})); test_templates(tmpls.get(), end_tokens, message_assist_call_id, tools, "{\n" " \"tool_calls\": [\n" @@ -663,11 +741,18 @@ static void test_template_output_parsers() { tmpls.get(), end_tokens, message_assist_call_id, tools, "[TOOL_CALLS][{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}, \"id\": \"123456789\"}]"); } + { + auto tmpls = read_templates("models/templates/Qwen-QwQ-32B.jinja"); + std::vector end_tokens{ "<|im_end|>" }; + + assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); + assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_tools).format); + } { auto tmpls = read_templates("models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja"); std::vector end_tokens{ "<|im_end|>" }; - assert_equals(COMMON_CHAT_FORMAT_CONTENT_ONLY, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); + assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_tools).format); assert_equals( COMMON_CHAT_FORMAT_HERMES_2_PRO, @@ -683,114 +768,288 @@ static void test_template_output_parsers() { .format); // Test parsing - assert_msg_equals(message_assist_call, common_chat_parse( - "\n" - "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" - "", - COMMON_CHAT_FORMAT_HERMES_2_PRO)); - assert_msg_equals(message_assist_call, common_chat_parse( - "{\"arg1\": 1}", - COMMON_CHAT_FORMAT_HERMES_2_PRO)); - assert_msg_equals(message_assist_call, common_chat_parse( - "\n" - "{\"arg1\": 1}\n" - "", - COMMON_CHAT_FORMAT_HERMES_2_PRO)); - assert_msg_equals(message_assist_call, common_chat_parse( - "\n" - " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" - "", - COMMON_CHAT_FORMAT_HERMES_2_PRO)); - assert_msg_equals(message_assist_call, 
common_chat_parse( - "\n" - " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" - "", - COMMON_CHAT_FORMAT_HERMES_2_PRO)); - assert_msg_equals(message_assist_call, common_chat_parse( - "\n" - " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" - "", - COMMON_CHAT_FORMAT_HERMES_2_PRO)); - assert_msg_equals(message_assist_call, common_chat_parse( - "```xml\n" - "\n" - " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" - "\n" - "```", - COMMON_CHAT_FORMAT_HERMES_2_PRO)); - assert_msg_equals(message_assist_call, common_chat_parse( - "```xml\n" - " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" - "```", - COMMON_CHAT_FORMAT_HERMES_2_PRO)); - assert_msg_equals(message_assist_call, common_chat_parse( - "```\n" - " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" - "```", - COMMON_CHAT_FORMAT_HERMES_2_PRO)); - assert_msg_equals(message_assist_call, common_chat_parse( - "```\n" - "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" - "```", - COMMON_CHAT_FORMAT_HERMES_2_PRO)); - assert_msg_equals(message_assist_call, common_chat_parse( - "```json\n" - " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" - "```", - COMMON_CHAT_FORMAT_HERMES_2_PRO)); - assert_msg_equals(message_assist_call, common_chat_parse( - "```json\n" - "\n" - " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}} \n" - " \n" - "``` ", - COMMON_CHAT_FORMAT_HERMES_2_PRO)); - assert_msg_equals(message_assist_call, common_chat_parse( - "\n" - " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" - "", - COMMON_CHAT_FORMAT_HERMES_2_PRO)); - assert_msg_equals(message_assist_call, common_chat_parse( - "\n" - " {\n" - " \"name\": \"special_function\", \"arguments\": {\"arg1\": 1}\n" - " }\n" - "", - COMMON_CHAT_FORMAT_HERMES_2_PRO)); - assert_msg_equals(message_assist_call, common_chat_parse( - "\n" - " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" - "", - COMMON_CHAT_FORMAT_HERMES_2_PRO)); - assert_msg_equals(message_assist_call, common_chat_parse( - "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", - COMMON_CHAT_FORMAT_HERMES_2_PRO)); - assert_msg_equals(message_assist_call, common_chat_parse( - "{\n \"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", - COMMON_CHAT_FORMAT_HERMES_2_PRO)); + assert_msg_equals( + simple_assist_msg("", "", "python", ""), + common_chat_parse( + "```json\n" + " { \"name\" : \"python\"", + /* is_partial= */ true, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + simple_assist_msg("Let's call something\n"), + common_chat_parse( + "Let's call something\n" + "{\"name\"", + /* is_partial= */ true, + { + /* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + assert_msg_equals( + simple_assist_msg("Let's call something\n"), + common_chat_parse( + "Let's call something\n" + "{\"name", + /* is_partial= */ true, + { + /* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + assert_msg_equals(message_assist_call_thoughts, + common_chat_parse( + // QwQ-32B's template adds a trailing if add_generation_prompt + "I'm\nthinking\n" + "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ false, + /* 
.thinking_forced_open = */ true, + })); + assert_msg_equals( + message_assist_call, + common_chat_parse( + "\n" + "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals(message_assist_call_content, + common_chat_parse( + "Hello, world!\nWhat's up?\n" + "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + common_chat_parse( + "{\"arg1\": 1}", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + common_chat_parse( + "\n" + "{\"arg1\": 1}\n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + common_chat_parse( + "\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + common_chat_parse( + "\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + common_chat_parse( + "\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + common_chat_parse( + "```xml\n" + "\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "\n" + "```", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + common_chat_parse( + "```xml\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "```", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + common_chat_parse( + "```\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "```", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + common_chat_parse( + "```\n" + "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "```", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + common_chat_parse( + "```json\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "```", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + common_chat_parse( + "```json\n" + "\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}} \n" + " \n" + "``` ", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + common_chat_parse( + "\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + common_chat_parse( + "\n" + " {\n" + " \"name\": \"special_function\", \"arguments\": {\"arg1\": 1}\n" + " }\n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + common_chat_parse( + "\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + common_chat_parse( + "{\"name\": 
\"special_function\", \"arguments\": {\"arg1\": 1}}", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals( + message_assist_call, + common_chat_parse( + "{\n \"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); - assert_msg_equals(message_assist_thoughts_unparsed_think, - common_chat_parse("I'm thinkingHello, world!\nWhat's up?", - COMMON_CHAT_FORMAT_HERMES_2_PRO)); - assert_msg_equals(message_assist_thoughts_unparsed_think, - common_chat_parse("I'm thinkingHello, world!\nWhat's up?", - COMMON_CHAT_FORMAT_HERMES_2_PRO)); + assert_msg_equals( + simple_assist_msg( + "This is not a tool call:", + "", + "special_function", + "{\"arg1\": 1}"), + common_chat_parse( + "This is not a tool call:\n" + "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals(message_assist, + common_chat_parse( + "Hello, world!\nWhat's up?", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + assert_msg_equals(message_assist_thoughts_unparsed_deepseek, + common_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_HERMES_2_PRO})); + // assert_msg_equals(message_assist_thoughts_unparsed_deepseek, + // common_chat_parse( + // "I'm\nthinkingHello, world!\nWhat's up?", + // COMMON_CHAT_FORMAT_HERMES_2_PRO)); assert_msg_equals(message_assist_thoughts, - common_chat_parse("I'm thinkingHello, world!\nWhat's up?", - COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING)); + common_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); assert_msg_equals(message_assist_thoughts, - common_chat_parse("I'm thinkingHello, world!\nWhat's up?", - COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING)); + common_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ true, + { + /* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + assert_msg_equals(message_assist_thoughts_unparsed_md, + common_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?\n```json\n{}```", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ true, + /* .thinking_forced_open = */ false, + /* .parse_tool_calls = */ false, + })); + assert_msg_equals(message_assist_thoughts_unparsed_md_partial, + common_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?\n```json\n{}```", + /* is_partial= */ true, + { + /* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ true, + /* .thinking_forced_open = */ false, + })); + assert_msg_equals(message_assist_thoughts_unopened_unparsed, + common_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + assert_msg_equals(message_assist_thoughts, + common_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = 
*/ false, + /* .thinking_forced_open = */ true, + })); test_templates(tmpls.get(), end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); test_templates(tmpls.get(), end_tokens, message_assist_call, tools, "\n" "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" ""); - test_templates(tmpls.get(), end_tokens, message_assist_call_python, tools, + test_templates(tmpls.get(), end_tokens, message_assist_call_python_lines, tools, "\n" - "{\"name\": \"python\", \"arguments\": {\"code\": \"print('hey')\"}}\n" + "{\"name\": \"python\", \"arguments\": {\"code\":\"# This is a program:\\nprint('hey')\"}}\n" ""); + assert_msg_equals( + simple_assist_msg("", /* reasoning_content= */ "nah uhg"), + common_chat_parse( + "nah uhg", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_HERMES_2_PRO, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); } { auto tmpls = read_templates("models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja"); @@ -806,6 +1065,13 @@ static void test_template_output_parsers() { inputs_tools_builtin) .format); + assert_equals( + message_assist_call, + common_chat_parse( + "{\"name\": \"special_function\", \"parameters\": {\"arg1\": 1}}", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_LLAMA_3_X})); + // test_templates(tmpls.get(), end_tokens, message_assist, tools, R"(?)", /* expect_grammar_triggered= */ false); test_templates(tmpls.get(), end_tokens, message_assist_call_code_interpreter, llama_3_1_tools, "<|python_tag|>code_interpreter.call(code=\"print('hey')\")"); @@ -836,6 +1102,22 @@ static void test_template_output_parsers() { assert_equals(COMMON_CHAT_FORMAT_CONTENT_ONLY, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); + for (auto is_partial : { false, true }) { + assert_equals( + message_assist_call, + common_chat_parse( + "{\"arg1\": 1}", + is_partial, + {COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1})); + } + + assert_equals( + message_assist_call, + common_chat_parse( + "{\"arg1\": 1}<", + /* is_partial= */ true, + {COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1})); + test_templates(tmpls.get(), end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); test_templates(tmpls.get(), end_tokens, message_assist_call, tools, "{\"arg1\": 1}"); @@ -847,6 +1129,47 @@ static void test_template_output_parsers() { assert_equals(COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); assert_equals(COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2, common_chat_templates_apply(tmpls.get(), inputs_tools).format); + assert_msg_equals( + simple_assist_msg( + "Hello, world!\nnono\nWhat's up?", + "", + "special_function", + "{\"arg1\": 1}"), + common_chat_parse( + "all\n" + "Hello, world!\n" + "nono\n" + "What's up?>>>special_function\n" + "{\"arg1\": 1}\n", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2})); + assert_msg_equals(message_assist_call_python_lines, + common_chat_parse( + "python\n" + "# This is a program:\n" + "print('hey')", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2})); + assert_msg_equals(message_assist_call_python_lines_unclosed, + common_chat_parse( + "python\n" + "# This is a program:\n" + "print('hey')", + /* is_partial= */ true, + {COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2})); + assert_msg_equals(message_assist_call, + common_chat_parse( + "special_function\n" + "{\"arg1\": 1} \n ", + /* is_partial= */ false, + 
{COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2})); + assert_msg_equals(message_assist, + common_chat_parse( + "all\n" + "Hello, world!\nWhat's up?", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2})); + test_templates(tmpls.get(), end_tokens, message_assist, {}, "all\n" "Hello, world!\n" @@ -872,22 +1195,73 @@ static void test_template_output_parsers() { auto tmpls = read_templates("models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja"); std::vector end_tokens{ "<|end▁of▁sentence|>" }; - assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_R1, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); - assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_R1, common_chat_templates_apply(tmpls.get(), inputs_tools).format); - assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING, common_chat_templates_apply(tmpls.get(), inputs_tools_think).format); + for (const auto & inputs : { inputs_no_tools, inputs_tools }) { + auto params = common_chat_templates_apply(tmpls.get(), inputs); + assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_R1, params.format); + assert_equals(true, params.thinking_forced_open); + } test_templates(tmpls.get(), end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); test_templates(tmpls.get(), end_tokens, message_assist_thoughts, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); - assert_msg_equals(message_assist_thoughts_unparsed_think, - common_chat_parse("I'm thinkingHello, world!\nWhat's up?", - COMMON_CHAT_FORMAT_DEEPSEEK_R1)); + assert_msg_equals( + simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking"), + common_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + { + COMMON_CHAT_FORMAT_DEEPSEEK_R1, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ true, + })); + assert_msg_equals( + simple_assist_msg("", "I need to remember the correct syntax. It starts with <|tool▁calls▁begin|> and ends with"), + common_chat_parse( + "I need to remember the correct syntax. It starts with <|tool▁calls▁begin|> and ends with", + /* is_partial= */ true, + { + COMMON_CHAT_FORMAT_DEEPSEEK_R1, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ true, + })); assert_msg_equals(message_assist_thoughts, - common_chat_parse("I'm thinkingHello, world!\nWhat's up?", - COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING)); + common_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_R1, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + assert_msg_equals(message_assist_thoughts_unopened_unparsed, + common_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_R1, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + assert_msg_equals(message_assist_thoughts, + common_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_R1, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ true, + })); assert_msg_equals(message_assist_thoughts, // Latest template update (ast of 20250209) adds a trailing \n if add_generation_prompt is true. 
- common_chat_parse("I'm thinkingHello, world!\nWhat's up?", - COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING)); + common_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_R1, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ true, + })); // test_templates(tmpls.get(), end_tokens, message_assist_call, tools, // "<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>special_function\n" // "```json\n" @@ -904,16 +1278,32 @@ static void test_template_output_parsers() { assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_R1, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format); assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_R1, common_chat_templates_apply(tmpls.get(), inputs_tools).format); - assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING, common_chat_templates_apply(tmpls.get(), inputs_tools_think).format); test_templates(tmpls.get(), end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); test_templates(tmpls.get(), end_tokens, message_assist_thoughts, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); - assert_msg_equals(message_assist_thoughts_unparsed_think, - common_chat_parse("I'm thinkingHello, world!\nWhat's up?", - COMMON_CHAT_FORMAT_DEEPSEEK_R1)); + assert_msg_equals(message_assist_thoughts_unparsed_deepseek, + common_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_DEEPSEEK_R1})); assert_msg_equals(message_assist_thoughts, - common_chat_parse("I'm thinkingHello, world!\nWhat's up?", - COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING)); + common_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_R1, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); + assert_msg_equals(message_assist_thoughts, + common_chat_parse( + "I'm\nthinkingHello, world!\nWhat's up?", + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_R1, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + /* .reasoning_in_content = */ false, + /* .thinking_forced_open = */ true, + })); assert_msg_equals(message_assist_call_thoughts_unparsed, common_chat_parse( @@ -922,7 +1312,17 @@ static void test_template_output_parsers() { "```json\n" "{\"arg1\": 1}\n" "```<|tool▁call▁end|><|tool▁calls▁end|>", - COMMON_CHAT_FORMAT_DEEPSEEK_R1)); + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_DEEPSEEK_R1})); + assert_msg_equals(message_assist_call, + common_chat_parse( + "<|tool▁calls|>function<|tool▁sep|>special_function\n" + "```json\n" + "{\"arg1\": 1}\n" + "```<|tool▁call▁end|><|tool▁calls▁end|>", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_DEEPSEEK_R1})); + assert_msg_equals(message_assist_call_thoughts, common_chat_parse( "I'm\nthinking\n\n" @@ -930,7 +1330,11 @@ static void test_template_output_parsers() { "```json\n" "{\"arg1\": 1}\n" "```<|tool▁call▁end|><|tool▁calls▁end|>", - COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING)); + /* is_partial= */ false, + { + /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_R1, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, + })); test_templates(tmpls.get(), end_tokens, message_assist_call, tools, "<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>special_function\n" "```json\n" @@ -939,6 +1343,90 @@ static void 
test_template_output_parsers() { } } +static void test_msg_diffs_compute() { + printf("[%s]\n", __func__); + { + common_chat_msg msg1; + + common_chat_msg msg2; + msg2.content = "Hello, world!"; + + common_chat_msg_diff diff; + diff.content_delta = "Hello, world!"; + + assert_equals( + {diff}, + common_chat_msg_diff::compute_diffs(msg1, msg2)); + } + { + common_chat_msg msg1; + msg1.content = "Hello,"; + + common_chat_msg msg2; + msg2.content = "Hello, world!"; + + common_chat_msg_diff diff; + diff.content_delta = " world!"; + + assert_equals( + {diff}, + common_chat_msg_diff::compute_diffs(msg1, msg2)); + } + { + common_chat_msg msg0; + + common_chat_msg msg1; + msg1.tool_calls = { { "special_function", "{\"ar", /* .id = */ "123" } }; + + common_chat_msg msg2; + msg2.tool_calls = { { "special_function", "{\"arg1\": 1}", /* .id = */ "123" } }; + + common_chat_msg_diff diff01; + diff01.tool_call_index = 0; + diff01.tool_call_delta.name = "special_function"; + diff01.tool_call_delta.id = "123"; + diff01.tool_call_delta.arguments = "{\"ar"; + + assert_equals( + {diff01}, + common_chat_msg_diff::compute_diffs(msg0, msg1)); + + common_chat_msg_diff diff12; + diff12.tool_call_index = 0; + // Note: neither id nor name change here. + diff12.tool_call_delta.arguments = "g1\": 1}"; + + assert_equals( + {diff12}, + common_chat_msg_diff::compute_diffs(msg1, msg2)); + } + { + common_chat_msg msg0; + + common_chat_msg msg2; + msg2.tool_calls = { + { "f1", "{\"arg1\": 1}", /* .id = */ "123" }, + { "f2", "{\"arg2\": 2}", /* .id = */ "222" }, + }; + + common_chat_msg_diff diff1; + diff1.tool_call_index = 0; + diff1.tool_call_delta.name = "f1"; + diff1.tool_call_delta.id = "123"; + diff1.tool_call_delta.arguments = "{\"arg1\": 1}"; + + common_chat_msg_diff diff2; + diff2.tool_call_index = 1; + diff2.tool_call_delta.name = "f2"; + diff2.tool_call_delta.id = "222"; + diff2.tool_call_delta.arguments = "{\"arg2\": 2}"; + + assert_equals( + {diff1, diff2}, + common_chat_msg_diff::compute_diffs(msg0, msg2)); + } +} + int main(int argc, char ** argv) { // try { #ifndef _WIN32 @@ -972,6 +1460,7 @@ int main(int argc, char ** argv) { } else #endif { + test_msg_diffs_compute(); test_msgs_oaicompat_json_conversion(); test_tools_oaicompat_json_conversion(); test_template_output_parsers(); diff --git a/tests/test-gguf.cpp b/tests/test-gguf.cpp index eaf572c66..3f0c312e2 100644 --- a/tests/test-gguf.cpp +++ b/tests/test-gguf.cpp @@ -16,6 +16,7 @@ constexpr int offset_has_data = 3000; enum handcrafted_file_type { HANDCRAFTED_HEADER_BAD_MAGIC = 10, + HANDCRAFTED_HEADER_BAD_VERSION_0 = 15, HANDCRAFTED_HEADER_BAD_VERSION_1 = 20, HANDCRAFTED_HEADER_BAD_VERSION_FUTURE = 30, HANDCRAFTED_HEADER_BAD_N_TENSORS = 40, @@ -51,6 +52,7 @@ enum handcrafted_file_type { static std::string handcrafted_file_type_name(const enum handcrafted_file_type hft) { switch (hft) { case HANDCRAFTED_HEADER_BAD_MAGIC: return "HEADER_BAD_MAGIC"; + case HANDCRAFTED_HEADER_BAD_VERSION_0: return "HEADER_BAD_VERSION_0"; case HANDCRAFTED_HEADER_BAD_VERSION_1: return "HEADER_BAD_VERSION_1"; case HANDCRAFTED_HEADER_BAD_VERSION_FUTURE: return "HEADER_BAD_VERSION_FUTURE"; case HANDCRAFTED_HEADER_BAD_N_KV: return "HEADER_BAD_N_KV"; @@ -171,7 +173,10 @@ static FILE * get_handcrafted_file(const unsigned int seed, const enum handcraft helper_write(file, GGUF_MAGIC, 4); } - if (hft == HANDCRAFTED_HEADER_BAD_VERSION_1) { + if (hft == HANDCRAFTED_HEADER_BAD_VERSION_0) { + const uint32_t version = 0; + helper_write(file, version); + } else if (hft == 
HANDCRAFTED_HEADER_BAD_VERSION_1) { const uint32_t version = 1; helper_write(file, version); } else if (hft == HANDCRAFTED_HEADER_BAD_VERSION_FUTURE) { @@ -660,6 +665,7 @@ static std::pair test_handcrafted_file(const unsigned int seed) { const std::vector hfts = { HANDCRAFTED_HEADER_BAD_MAGIC, + HANDCRAFTED_HEADER_BAD_VERSION_0, HANDCRAFTED_HEADER_BAD_VERSION_1, HANDCRAFTED_HEADER_BAD_VERSION_FUTURE, HANDCRAFTED_HEADER_BAD_N_KV, diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp index 8988c347e..6d64f0737 100644 --- a/tests/test-grammar-integration.cpp +++ b/tests/test-grammar-integration.cpp @@ -7,6 +7,8 @@ #include "../src/unicode.h" #include "../src/llama-grammar.h" +#include + #include #include #include diff --git a/tests/test-json-partial.cpp b/tests/test-json-partial.cpp new file mode 100644 index 000000000..bc136bece --- /dev/null +++ b/tests/test-json-partial.cpp @@ -0,0 +1,237 @@ +#include "common.h" +#include "json-partial.h" +#include +#include +#include + +template static void assert_equals(const T & expected, const T & actual) { + if (expected != actual) { + std::cerr << "Expected: " << expected << std::endl; + std::cerr << "Actual: " << actual << std::endl; + std::cerr << std::flush; + throw std::runtime_error("Test failed"); + } +} + +static void test_json_healing() { + auto parse = [](const std::string & str) { + std::cerr << "# Parsing: " << str << '\n'; + std::string::const_iterator it = str.begin(); + const auto end = str.end(); + common_json out; + std::string healing_marker = "$llama.cpp.json$"; + if (common_json_parse(it, end, healing_marker, out)) { + auto dump = out.json.dump(); + std::cerr << "Parsed: " << dump << '\n'; + std::cerr << "Magic: " << out.healing_marker.json_dump_marker << '\n'; + std::string result; + if (!out.healing_marker.json_dump_marker.empty()) { + auto i = dump.find(out.healing_marker.json_dump_marker); + if (i == std::string::npos) { + throw std::runtime_error("Failed to find magic in dump " + dump + " (magic: " + out.healing_marker.json_dump_marker + ")"); + } + result = dump.substr(0, i); + } else { + result = dump; + } + std::cerr << "Result: " << result << '\n'; + if (string_starts_with(str, result)) { + std::cerr << "Failure!\n"; + } + // return dump; + } else { + throw std::runtime_error("Failed to parse: " + str); + } + + }; + auto parse_all = [&](const std::string & str) { + for (size_t i = 1; i < str.size(); i++) { + parse(str.substr(0, i)); + } + }; + parse_all("{\"a\": \"b\"}"); + parse_all("{\"hey\": 1, \"ho\\\"ha\": [1]}"); + + parse_all("[{\"a\": \"b\"}]"); + + auto test = [&](const std::vector & inputs, const std::string & expected, const std::string & expected_marker) { + for (const auto & input : inputs) { + common_json out; + assert_equals(true, common_json_parse(input, "$foo", out)); + assert_equals(expected, out.json.dump()); + assert_equals(expected_marker, out.healing_marker.json_dump_marker); + } + }; + // No healing needed: + test( + { + R"([{"a":"b"}, "y"])", + }, + R"([{"a":"b"},"y"])", + "" + ); + // Partial literals can't be healed: + test( + { + R"([1)", + R"([tru)", + R"([n)", + R"([nul)", + R"([23.2)", + }, + R"(["$foo"])", + R"("$foo)" + ); + test( + { + R"({"a": 1)", + R"({"a": tru)", + R"({"a": n)", + R"({"a": nul)", + R"({"a": 23.2)", + }, + R"({"a":"$foo"})", + R"("$foo)" + ); + test( + { + R"({)", + }, + R"({"$foo":1})", + R"("$foo)" + ); + test( + { + R"([)", + }, + R"(["$foo"])", + R"("$foo)" + ); + // Healing right after a full literal + test( + { + R"(1 )", + }, + 
R"(1)", + "" + ); + test( + { + R"(true)", + R"(true )", + }, + R"(true)", + "" + ); + test( + { + R"(null)", + R"(null )", + }, + R"(null)", + "" + ); + test( + { + R"([1 )", + }, + R"([1,"$foo"])", + R"(,"$foo)" + ); + test( + { + R"([{})", + R"([{} )", + }, + R"([{},"$foo"])", + R"(,"$foo)" + ); + test( + { + R"([true)", + }, + // TODO: detect the true/false/null literal was complete + R"(["$foo"])", + R"("$foo)" + ); + test( + { + R"([true )", + }, + R"([true,"$foo"])", + R"(,"$foo)" + ); + test( + { + R"([true,)", + }, + R"([true,"$foo"])", + R"("$foo)" + ); + // Test nesting + test( + { + R"([{"a": [{"b": [{)", + }, + R"([{"a":[{"b":[{"$foo":1}]}]}])", + R"("$foo)" + ); + test( + { + R"([{"a": [{"b": [)", + }, + R"([{"a":[{"b":["$foo"]}]}])", + R"("$foo)" + ); + + test( + { + R"([{"a": "b"})", + R"([{"a": "b"} )", + }, + R"([{"a":"b"},"$foo"])", + R"(,"$foo)" + ); + test( + { + R"([{"a": "b"},)", + R"([{"a": "b"}, )", + }, + R"([{"a":"b"},"$foo"])", + R"("$foo)" + ); + test( + { + R"({ "code)", + }, + R"({"code$foo":1})", + R"($foo)" + ); + test( + { + R"({ "code\)", + }, + R"({"code\\$foo":1})", + R"(\$foo)" + ); + test( + { + R"({ "code")", + }, + R"({"code":"$foo"})", + R"(:"$foo)" + ); + test( + { + R"({ "key")", + }, + R"({"key":"$foo"})", + R"(:"$foo)" + ); +} + +int main() { + test_json_healing(); + std::cerr << "All tests passed.\n"; + return 0; +} diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index 38cf01d6d..78ee55e24 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -6,6 +6,8 @@ #include "../src/llama-grammar.h" +#include + #include #include #include diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index 60ac62b38..6300f25ca 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -98,7 +98,7 @@ static void test_top_p(const std::vector & probs, const std::vector & probs, const std::vector & probs, const std::vector numa mode (default: disabled)\n"); printf(" -r, --repetitions number of times to repeat each test (default: %d)\n", cmd_params_defaults.reps); - printf(" --prio <0|1|2|3> process/thread priority (default: %d)\n", + printf(" --prio <-1|0|1|2|3> process/thread priority (default: %d)\n", cmd_params_defaults.prio); printf(" --delay <0...N> (seconds) delay between each test (default: %d)\n", cmd_params_defaults.delay); diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index e7ba23587..4baa15b96 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -1,37 +1,50 @@ # mtmd -add_library(mtmd OBJECT +find_package(Threads REQUIRED) + +add_library(mtmd mtmd.cpp - mtmd-helper.cpp + mtmd-audio.cpp mtmd.h clip.cpp clip.h clip-impl.h + mtmd-helper.cpp + mtmd-helper.h ) -target_link_libraries(mtmd PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT}) - -target_include_directories(mtmd PUBLIC .) +target_link_libraries (mtmd PUBLIC ggml llama) +target_link_libraries (mtmd PRIVATE Threads::Threads) +target_include_directories(mtmd PUBLIC .) target_include_directories(mtmd PRIVATE ../..) 
-target_include_directories(mtmd PRIVATE ../../common) # for stb_image.h +target_include_directories(mtmd PRIVATE ../../vendor) +target_compile_features (mtmd PRIVATE cxx_std_17) -target_compile_features(mtmd PRIVATE cxx_std_17) - -add_library(mtmd_static STATIC $) if (BUILD_SHARED_LIBS) - set_target_properties(mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_compile_definitions(mtmd PRIVATE LLAMA_SHARED LLAMA_BUILD) - add_library(mtmd_shared SHARED $) - target_link_libraries(mtmd_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT}) - install(TARGETS mtmd_shared LIBRARY) + set_target_properties (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_compile_definitions(mtmd PRIVATE LLAMA_BUILD) + target_compile_definitions(mtmd PUBLIC LLAMA_SHARED) endif() +set(MTMD_PUBLIC_HEADERS + ${CMAKE_CURRENT_SOURCE_DIR}/mtmd.h + ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-helper.h + ) + +set_target_properties(mtmd + PROPERTIES + PUBLIC_HEADER "${MTMD_PUBLIC_HEADERS}") + +install(TARGETS mtmd LIBRARY PUBLIC_HEADER) + if (NOT MSVC) - target_compile_options(mtmd PRIVATE -Wno-cast-qual) # stb_image.h + # for stb_image.h and miniaudio.h + target_compile_options(mtmd PRIVATE -Wno-cast-qual) endif() -if(TARGET BUILD_INFO) - add_dependencies(mtmd BUILD_INFO) +if (TARGET BUILD_INFO) + add_dependencies(mtmd BUILD_INFO) + add_dependencies(mtmd-helper BUILD_INFO) endif() add_executable(llama-llava-cli deprecation-warning.cpp) @@ -40,8 +53,8 @@ add_executable(llama-minicpmv-cli deprecation-warning.cpp) add_executable(llama-qwen2vl-cli deprecation-warning.cpp) set(TARGET llama-mtmd-cli) -add_executable(${TARGET} mtmd-cli.cpp) -set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT}) +add_executable (${TARGET} mtmd-cli.cpp) +set_target_properties (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli) +install (TARGETS ${TARGET} RUNTIME) +target_link_libraries (${TARGET} PRIVATE common mtmd Threads::Threads) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 7b7d2df39..62c936ed0 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -16,22 +16,26 @@ #define KEY_FTYPE "general.file_type" #define KEY_NAME "general.name" #define KEY_DESCRIPTION "general.description" -#define KEY_MINICPMV_VERSION "clip.minicpmv_version" +#define KEY_PROJ_TYPE "clip.projector_type" +#define KEY_HAS_AUDIO_ENC "clip.has_audio_encoder" +#define KEY_HAS_VISION_ENC "clip.has_vision_encoder" #define KEY_USE_GELU "clip.use_gelu" #define KEY_USE_SILU "clip.use_silu" -#define KEY_N_EMBD "clip.vision.embedding_length" -#define KEY_N_FF "clip.vision.feed_forward_length" -#define KEY_N_BLOCK "clip.vision.block_count" -#define KEY_N_HEAD "clip.vision.attention.head_count" -#define KEY_LAYER_NORM_EPS "clip.vision.attention.layer_norm_epsilon" -#define KEY_PROJ_DIM "clip.vision.projection_dim" + +#define KEY_N_EMBD "clip.%s.embedding_length" +#define KEY_N_FF "clip.%s.feed_forward_length" +#define KEY_N_BLOCK "clip.%s.block_count" +#define KEY_PROJ_DIM "clip.%s.projection_dim" +#define KEY_N_HEAD "clip.%s.attention.head_count" +#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" + +// vision-specific #define KEY_IMAGE_SIZE "clip.vision.image_size" #define KEY_PATCH_SIZE "clip.vision.patch_size" #define KEY_IMAGE_MEAN "clip.vision.image_mean" #define KEY_IMAGE_STD "clip.vision.image_std" #define KEY_FEATURE_LAYER 
"clip.vision.feature_layer" #define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor" -#define KEY_PROJ_TYPE "clip.projector_type" #define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size" #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" @@ -39,13 +43,18 @@ #define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution" #define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern" #define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size" +#define KEY_MINICPMV_VERSION "clip.minicpmv_version" + +// audio-specific +#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins" +#define KEY_A_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor" // // tensor name constants // -#define TN_POS_EMBD "v.position_embd.weight" +#define TN_POS_EMBD "%s.position_embd.weight" #define TN_CLASS_EMBD "v.class_embd" #define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat #define TN_PATCH_EMBD_1 "v.patch_embd.weight.1" @@ -95,6 +104,13 @@ #define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s" #define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s" +// ultravox +#define TN_CONV1D "a.conv1d.%d.%s" +#define TN_MM_AUDIO_MLP "mm.a.mlp.%d.%s" +#define TN_MM_AUDIO_FC "mm.a.fc.%s" // fully connected layer +#define TN_MM_NORM_PRE "mm.a.norm_pre.%s" +#define TN_MM_NORM_MID "mm.a.norm_mid.%s" + // align x to upper multiple of n #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n)) @@ -110,8 +126,11 @@ enum projector_type { PROJECTOR_TYPE_IDEFICS3, PROJECTOR_TYPE_PIXTRAL, PROJECTOR_TYPE_QWEN25VL, + PROJECTOR_TYPE_ULTRAVOX, PROJECTOR_TYPE_INTERNVL, PROJECTOR_TYPE_LLAMA4, + PROJECTOR_TYPE_QWEN2A, + PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx PROJECTOR_TYPE_UNKNOWN, }; @@ -126,8 +145,11 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_GEMMA3, "gemma3"}, { PROJECTOR_TYPE_IDEFICS3, "idefics3"}, { PROJECTOR_TYPE_PIXTRAL, "pixtral"}, + { PROJECTOR_TYPE_ULTRAVOX, "ultravox"}, { PROJECTOR_TYPE_INTERNVL, "internvl"}, { PROJECTOR_TYPE_LLAMA4, "llama4"}, + { PROJECTOR_TYPE_QWEN2A, "qwen2a"}, + { PROJECTOR_TYPE_QWEN25O, "qwen2.5o"}, }; static projector_type clip_projector_type_from_string(const std::string & str) { @@ -147,8 +169,10 @@ struct clip_image_u8 { std::vector buf; }; -// RGB float32 image (NHWC) -// Memory layout: RGBRGBRGB... +// For images, buf.size() == nx*ny*3 +// Memory layout: RGBRGBRGB... 
+// For audio, only one channel is used, buf.size() == nx*ny +// nx will be n_frames and ny will be n_mel struct clip_image_f32 { int nx; int ny; @@ -242,6 +266,7 @@ struct clip_image_u8_batch { struct clip_image_f32_batch { std::vector entries; + bool is_audio = false; // for llava-uhd style models, we need to know the grid size // note: entries.size() == grid_x * grid_y + 1 (one overview image) @@ -249,7 +274,12 @@ struct clip_image_f32_batch { int grid_y = 0; clip_image_f32_batch clone() const { - clip_image_f32_batch new_batch; + clip_image_f32_batch new_batch{ + /* entries */ {}, + /* is_audio */ is_audio, + /* grid_x */ grid_x, + /* grid_y */ grid_y, + }; new_batch.entries.reserve(entries.size()); for (const auto & entry : entries) { new_batch.entries.emplace_back(new clip_image_f32(*entry)); diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index eba07f6c8..c25bacc17 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -11,9 +11,6 @@ #include "ggml-backend.h" #include "gguf.h" -#define STB_IMAGE_IMPLEMENTATION -#include "stb_image.h" - #include #include #include @@ -35,6 +32,7 @@ struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callbac enum ffn_op_type { FFN_GELU, + FFN_GELU_ERF, FFN_SILU, FFN_GELU_QUICK, }; @@ -174,9 +172,13 @@ struct clip_hparams { int32_t n_layer; int32_t proj_scale_factor = 0; // idefics3 + float image_mean[3]; + float image_std[3]; + // for models using dynamic image size, we need to have a smaller image size to warmup // otherwise, user will get OOM everytime they load the model int32_t warmup_image_size = 0; + int32_t warmup_audio_size = 3000; ffn_op_type ffn_op = FFN_GELU; @@ -191,6 +193,14 @@ struct clip_hparams { int32_t attn_window_size = 0; int32_t n_wa_pattern = 0; int32_t spatial_merge_size = 0; + + // audio + int32_t n_mel_bins = 0; // whisper preprocessor + int32_t proj_stack_factor = 0; // ultravox + + // legacy + bool has_llava_projector = false; + int minicpmv_version = 0; }; struct clip_layer { @@ -228,8 +238,10 @@ struct clip_layer { ggml_tensor * ls_2_w = nullptr; }; -struct clip_vision_model { - struct clip_hparams hparams; +struct clip_model { + clip_modality modality = CLIP_MODALITY_VISION; + projector_type proj_type = PROJECTOR_TYPE_MLP; + clip_hparams hparams; // embeddings ggml_tensor * class_embedding = nullptr; @@ -246,7 +258,9 @@ struct clip_vision_model { ggml_tensor * post_ln_w; ggml_tensor * post_ln_b; - ggml_tensor * projection; + ggml_tensor * projection; // TODO: rename it to fc (fully connected layer) + ggml_tensor * mm_fc_w; + ggml_tensor * mm_fc_b; // LLaVA projection ggml_tensor * mm_input_norm_w = nullptr; @@ -332,17 +346,18 @@ struct clip_vision_model { // pixtral ggml_tensor * token_embd_img_break = nullptr; ggml_tensor * mm_patch_merger_w = nullptr; + + // ultravox / whisper encoder + ggml_tensor * conv1d_1_w = nullptr; + ggml_tensor * conv1d_1_b = nullptr; + ggml_tensor * conv1d_2_w = nullptr; + ggml_tensor * conv1d_2_b = nullptr; + ggml_tensor * mm_norm_pre_w = nullptr; + ggml_tensor * mm_norm_mid_w = nullptr; }; struct clip_ctx { - bool has_llava_projector = false; - int minicpmv_version = 0; - - struct clip_vision_model vision_model; - projector_type proj_type = PROJECTOR_TYPE_MLP; - - float image_mean[3]; - float image_std[3]; + clip_model model; gguf_context_ptr ctx_gguf; ggml_context_ptr ctx_data; @@ -396,11 +411,16 @@ struct clip_ctx { ggml_backend_free(backend_cpu); } } + + // this function is added so that we don't change too much of the existing code + projector_type 
proj_type() const { + return model.proj_type; + } }; struct clip_graph { clip_ctx * ctx; - const clip_vision_model & model; + const clip_model & model; const clip_hparams & hparams; // we only support single image per batch @@ -423,7 +443,7 @@ struct clip_graph { clip_graph(clip_ctx * ctx, const clip_image_f32 & img) : ctx(ctx), - model(ctx->vision_model), + model(ctx->model), hparams(model.hparams), img(img), patch_size(hparams.patch_size), @@ -455,7 +475,7 @@ struct clip_graph { model.position_embeddings, nullptr); - if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { + if (ctx->proj_type() == PROJECTOR_TYPE_GEMMA3) { const int batch_size = 1; GGML_ASSERT(n_patches_x == n_patches_y); const int patches_per_image = n_patches_x; @@ -478,7 +498,7 @@ struct clip_graph { ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)), cur); - } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { + } else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) { // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578 const int scale_factor = model.hparams.proj_scale_factor; @@ -612,7 +632,7 @@ struct clip_graph { const int n_pos = n_patches; const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position - norm_type norm_t = ctx->proj_type == PROJECTOR_TYPE_QWEN25VL + norm_type norm_t = ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL ? NORM_TYPE_RMS // qwen 2.5 vl : NORM_TYPE_NORMAL; // qwen 2 vl @@ -828,11 +848,11 @@ struct clip_graph { const int d_head = 128; int n_head = n_embd/d_head; int num_query = 96; - if (ctx->minicpmv_version == 2) { + if (ctx->model.hparams.minicpmv_version == 2) { num_query = 96; - } else if (ctx->minicpmv_version == 3) { + } else if (ctx->model.hparams.minicpmv_version == 3) { num_query = 64; - } else if (ctx->minicpmv_version == 4) { + } else if (ctx->model.hparams.minicpmv_version == 4) { num_query = 64; } @@ -1049,7 +1069,7 @@ struct clip_graph { int il_last = hparams.n_layer - 1; int deepest_feature_layer = -1; - if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { + if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV || ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE) { il_last += 1; } @@ -1183,7 +1203,7 @@ struct clip_graph { } // llava projector (also used by granite) - if (ctx->has_llava_projector) { + if (ctx->model.hparams.has_llava_projector) { embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); @@ -1197,7 +1217,7 @@ struct clip_graph { // print_tensor_info(embeddings, "embeddings"); // llava projector - if (ctx->proj_type == PROJECTOR_TYPE_MLP) { + if (ctx->proj_type() == PROJECTOR_TYPE_MLP) { embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); @@ -1207,7 +1227,7 @@ struct clip_graph { embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); } } - else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { + else if (ctx->proj_type() == PROJECTOR_TYPE_MLP_NORM) { embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false); @@ -1228,7 +1248,7 @@ struct clip_graph { embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w), model.mm_4_b); } - else if (ctx->proj_type == PROJECTOR_TYPE_LDP) { + else if (ctx->proj_type() == PROJECTOR_TYPE_LDP) { // 
MobileVLM projector int n_patch = 24; ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings); @@ -1338,7 +1358,7 @@ struct clip_graph { } embeddings = block_1; } - else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) + else if (ctx->proj_type() == PROJECTOR_TYPE_LDPV2) { int n_patch = 24; ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); @@ -1368,7 +1388,7 @@ struct clip_graph { } // glm projector - else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { + else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE) { size_t gridsz = (size_t)sqrt(embeddings->ne[1]); embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3)); embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]); @@ -1408,6 +1428,114 @@ struct clip_graph { return gf; } + // whisper encoder with custom projector + ggml_cgraph * build_whisper_enc() { + const int n_frames = img.nx; + const int n_pos = n_frames / 2; + GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos); + + ggml_tensor * inp = build_inp_raw(1); + + // conv1d block + { + // convolution + gelu + ggml_tensor * cur = ggml_conv_1d_ph(ctx0, model.conv1d_1_w, inp, 1, 1); + cur = ggml_add(ctx0, cur, model.conv1d_1_b); + + cur = ggml_gelu_erf(ctx0, cur); + + cur = ggml_conv_1d_ph(ctx0, model.conv1d_2_w, cur, 2, 1); + cur = ggml_add(ctx0, cur, model.conv1d_2_b); + + cur = ggml_gelu_erf(ctx0, cur); + // transpose + inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + cb(inp, "after_conv1d", -1); + } + + // sanity check (only check one layer, but it should be the same for all) + GGML_ASSERT(model.layers[0].ln_1_w && model.layers[0].ln_1_b); + GGML_ASSERT(model.layers[0].ln_2_w && model.layers[0].ln_2_b); + GGML_ASSERT(model.layers[0].q_b); + GGML_ASSERT(model.layers[0].v_b); + GGML_ASSERT(!model.layers[0].k_b); // no bias for k + GGML_ASSERT(model.post_ln_w && model.post_ln_b); + + ggml_tensor * pos_embd_selected = ggml_view_2d( + ctx0, model.position_embeddings, + model.position_embeddings->ne[0], n_pos, + model.position_embeddings->nb[1], 0 + ); + ggml_tensor * cur = build_vit( + inp, n_pos, + NORM_TYPE_NORMAL, + hparams.ffn_op, + pos_embd_selected, + nullptr); + + cb(cur, "after_transformer", -1); + + if (ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX) { + // StackAudioFrames + // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py + { + int64_t stride = n_embd * hparams.proj_stack_factor; + int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride); + int64_t pad = padded_len - ggml_nelements(cur); + if (pad > 0) { + cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0); + cur = ggml_pad(ctx0, cur, pad, 0, 0, 0); + } + cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride, + ggml_row_size(cur->type, stride), 0); + } + + cb(cur, "after_stacked", -1); + + // UltravoxProjector + { + // pre-norm + cur = ggml_rms_norm(ctx0, cur, 1e-6); + cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w); + + // ffn in + cur = ggml_mul_mat(ctx0, model.mm_1_w, cur); + + // swiglu + { + int64_t split_point = cur->ne[0] / 2; + ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0)); + ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur))); + + // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half + x1 = ggml_silu(ctx0, x1); + cur = ggml_mul(ctx0, x0, x1); + } + + // mid-norm + cur = ggml_rms_norm(ctx0, cur, 1e-6); + cur = 
ggml_mul(ctx0, cur, model.mm_norm_mid_w); + + // ffn out + cur = ggml_mul_mat(ctx0, model.mm_2_w, cur); + } + + } else if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2A) { + // projector + cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur); + cur = ggml_add(ctx0, cur, model.mm_fc_b); + + } else { + GGML_ABORT("%s: unknown projector type", __func__); + } + + cb(cur, "projected", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + private: // // utility functions @@ -1541,6 +1669,17 @@ private: inpL = cur; } + // TODO @ngxson : find a way to move this outside + if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2A) { + ggml_tensor * cur = inpL; + cur = ggml_transpose(ctx0, cur); + cur = ggml_cont(ctx0, cur); + cur = ggml_pool_1d(ctx0, cur, GGML_OP_POOL_AVG, 2, 2, 0); + cur = ggml_transpose(ctx0, cur); + cur = ggml_cont(ctx0, cur); + inpL = cur; + } + // post-layernorm if (model.post_ln_w) { inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, -1); @@ -1562,8 +1701,8 @@ private: return inp; } - ggml_tensor * build_inp_raw() { - ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, 3); + ggml_tensor * build_inp_raw(int channels = 3) { + ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels); ggml_set_name(inp_raw, "inp_raw"); ggml_set_input(inp_raw); return inp_raw; @@ -1641,6 +1780,11 @@ private: cur = ggml_gelu(ctx0, cur); cb(cur, "ffn_gelu", il); } break; + case FFN_GELU_ERF: + { + cur = ggml_gelu_erf(ctx0, cur); + cb(cur, "ggml_gelu_erf", il); + } break; case FFN_GELU_QUICK: { cur = ggml_gelu_quick(ctx0, cur); @@ -1805,7 +1949,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 ggml_cgraph * res; - switch (ctx->proj_type) { + switch (ctx->proj_type()) { case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_IDEFICS3: { @@ -1832,6 +1976,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { res = graph.build_llama4(); } break; + case PROJECTOR_TYPE_ULTRAVOX: + case PROJECTOR_TYPE_QWEN2A: + { + res = graph.build_whisper_enc(); + } break; default: { res = graph.build_llava(); @@ -1844,13 +1993,15 @@ struct clip_model_loader { ggml_context_ptr ctx_meta; gguf_context_ptr ctx_gguf; - clip_ctx & ctx_clip; std::string fname; size_t model_size = 0; // in bytes - // TODO @ngxson : we should not pass clip_ctx here, it should be clip_vision_model - clip_model_loader(const char * fname, clip_ctx & ctx_clip) : ctx_clip(ctx_clip), fname(fname) { + bool has_vision = false; + bool has_audio = false; + + // TODO @ngxson : we should not pass clip_ctx here, it should be clip_model + clip_model_loader(const char * fname) : fname(fname) { struct ggml_context * meta = nullptr; struct gguf_init_params params = { @@ -1882,6 +2033,19 @@ struct clip_model_loader { LOG_INF("\n"); } + // modalities + { + get_bool(KEY_HAS_VISION_ENC, has_vision, false); + get_bool(KEY_HAS_AUDIO_ENC, has_audio, false); + + if (has_vision) { + LOG_INF("%s: has vision encoder\n", __func__); + } + if (has_audio) { + LOG_INF("%s: has audio encoder\n", __func__); + } + } + // tensors { for (int i = 0; i < n_tensors; ++i) { @@ -1897,44 +2061,72 @@ struct clip_model_loader { } } - void load_hparams() { - auto & hparams = ctx_clip.vision_model.hparams; + void load_hparams(clip_model & model, clip_modality modality) { + auto & hparams = model.hparams; std::string log_ffn_op; // for logging + // sanity check + if (modality == CLIP_MODALITY_VISION) { + GGML_ASSERT(has_vision); + } else if (modality == 
CLIP_MODALITY_AUDIO) { + GGML_ASSERT(has_audio); + } + model.modality = modality; + + // projector type std::string proj_type; { get_string(KEY_PROJ_TYPE, proj_type, false); if (!proj_type.empty()) { - ctx_clip.proj_type = clip_projector_type_from_string(proj_type); + model.proj_type = clip_projector_type_from_string(proj_type); } - if (ctx_clip.proj_type == PROJECTOR_TYPE_UNKNOWN) { + if (model.proj_type == PROJECTOR_TYPE_UNKNOWN) { throw std::runtime_error(string_format("%s: unknown projector type: %s\n", __func__, proj_type.c_str())); } + + // correct arch for multimodal models + if (model.proj_type == PROJECTOR_TYPE_QWEN25O) { + model.proj_type = modality == CLIP_MODALITY_VISION + ? PROJECTOR_TYPE_QWEN25VL + : PROJECTOR_TYPE_QWEN2A; + } } + const bool is_vision = model.modality == CLIP_MODALITY_VISION; + const bool is_audio = model.modality == CLIP_MODALITY_AUDIO; + // other hparams { - get_i32(KEY_MINICPMV_VERSION, ctx_clip.minicpmv_version, false); // legacy + const char * prefix = is_vision ? "vision" : "audio"; + get_u32(string_format(KEY_N_EMBD, prefix), hparams.n_embd); + get_u32(string_format(KEY_N_HEAD, prefix), hparams.n_head); + get_u32(string_format(KEY_N_FF, prefix), hparams.n_ff); + get_u32(string_format(KEY_N_BLOCK, prefix), hparams.n_layer); + get_u32(string_format(KEY_PROJ_DIM, prefix), hparams.projection_dim); + get_f32(string_format(KEY_LAYER_NORM_EPS, prefix), hparams.eps); - get_u32(KEY_N_EMBD, hparams.n_embd); - get_u32(KEY_N_HEAD, hparams.n_head); - get_u32(KEY_N_FF, hparams.n_ff); - get_u32(KEY_N_BLOCK, hparams.n_layer); - get_u32(KEY_PROJ_DIM, hparams.projection_dim); - get_f32(KEY_LAYER_NORM_EPS, hparams.eps); - get_u32(KEY_IMAGE_SIZE, hparams.image_size); - get_u32(KEY_PATCH_SIZE, hparams.patch_size); - get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false); - get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false); + if (is_vision) { + get_u32(KEY_IMAGE_SIZE, hparams.image_size); + get_u32(KEY_PATCH_SIZE, hparams.patch_size); + get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false); + get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false); + get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy + + } else if (is_audio) { + get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins); + + } else { + GGML_ASSERT(false && "unknown modality"); + } // default warmup value hparams.warmup_image_size = hparams.image_size; - ctx_clip.has_llava_projector = ctx_clip.proj_type == PROJECTOR_TYPE_MLP - || ctx_clip.proj_type == PROJECTOR_TYPE_MLP_NORM - || ctx_clip.proj_type == PROJECTOR_TYPE_LDP - || ctx_clip.proj_type == PROJECTOR_TYPE_LDPV2; + hparams.has_llava_projector = model.proj_type == PROJECTOR_TYPE_MLP + || model.proj_type == PROJECTOR_TYPE_MLP_NORM + || model.proj_type == PROJECTOR_TYPE_LDP + || model.proj_type == PROJECTOR_TYPE_LDPV2; { bool use_gelu = false; @@ -1964,7 +2156,7 @@ struct clip_model_loader { } } - { + if (is_vision) { int idx_mean = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_MEAN); int idx_std = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_STD); GGML_ASSERT(idx_mean >= 0 && "image_mean not found"); @@ -1972,8 +2164,8 @@ struct clip_model_loader { const float * mean_data = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_mean); const float * std_data = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_std); for (int i = 0; i < 3; ++i) { - ctx_clip.image_mean[i] = mean_data[i]; - ctx_clip.image_std[i] = std_data[i]; + hparams.image_mean[i] = mean_data[i]; + 
hparams.image_std[i] = std_data[i]; } } @@ -1990,11 +2182,11 @@ struct clip_model_loader { } // model-specific params - switch (ctx_clip.proj_type) { + switch (model.proj_type) { case PROJECTOR_TYPE_MINICPMV: { - if (ctx_clip.minicpmv_version == 0) { - ctx_clip.minicpmv_version = 2; // default to 2 if not set + if (hparams.minicpmv_version == 0) { + hparams.minicpmv_version = 2; // default to 2 if not set } } break; case PROJECTOR_TYPE_IDEFICS3: @@ -2050,6 +2242,17 @@ struct clip_model_loader { isize, isize*3, // 336, 1008 }; } break; + case PROJECTOR_TYPE_ULTRAVOX: + case PROJECTOR_TYPE_QWEN2A: + { + bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX; + get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack); + if (hparams.n_mel_bins != 128) { + throw std::runtime_error(string_format("%s: only 128 mel bins are supported for ultravox\n", __func__)); + } + hparams.ffn_op = FFN_GELU_ERF; + log_ffn_op = "gelu_erf"; // temporary solution for logging + } break; default: break; } @@ -2059,29 +2262,36 @@ struct clip_model_loader { LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head); LOG_INF("%s: n_ff: %d\n", __func__, hparams.n_ff); LOG_INF("%s: n_layer: %d\n", __func__, hparams.n_layer); - LOG_INF("%s: projection_dim: %d\n", __func__, hparams.projection_dim); - LOG_INF("%s: image_size: %d\n", __func__, hparams.image_size); - LOG_INF("%s: patch_size: %d\n", __func__, hparams.patch_size); - LOG_INF("\n"); - LOG_INF("%s: has_llava_proj: %d\n", __func__, ctx_clip.has_llava_projector); - LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version); - LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor); - LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern); LOG_INF("%s: ffn_op: %s\n", __func__, log_ffn_op.c_str()); + LOG_INF("%s: projection_dim: %d\n", __func__, hparams.projection_dim); + if (is_vision) { + LOG_INF("\n--- vision hparams ---\n"); + LOG_INF("%s: image_size: %d\n", __func__, hparams.image_size); + LOG_INF("%s: patch_size: %d\n", __func__, hparams.patch_size); + LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector); + LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version); + LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor); + LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern); + } else if (is_audio) { + LOG_INF("\n--- audio hparams ---\n"); + LOG_INF("%s: n_mel_bins: %d\n", __func__, hparams.n_mel_bins); + LOG_INF("%s: proj_stack_factor: %d\n", __func__, hparams.proj_stack_factor); + } + LOG_INF("\n"); LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0); LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0); - - if (ctx_clip.proj_type == PROJECTOR_TYPE_LLAMA4) { - LOG_WRN("%s: llama 4 vision is known to have degraded quality: https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__); - } } } - void load_tensors() { - auto & hparams = ctx_clip.vision_model.hparams; + void load_tensors(clip_ctx & ctx_clip) { + auto & model = ctx_clip.model; + auto & hparams = model.hparams; std::map tensor_offset; std::vector tensors_to_load; + // TODO @ngxson : support both audio and video in the future + const char * prefix = model.modality == CLIP_MODALITY_AUDIO ? 
"a" : "v"; + // get offsets for (int64_t i = 0; i < gguf_get_n_tensors(ctx_gguf.get()); ++i) { const char * name = gguf_get_tensor_name(ctx_gguf.get(), i); @@ -2115,51 +2325,49 @@ struct clip_model_loader { return cur; }; - auto & vision_model = ctx_clip.vision_model; + model.class_embedding = get_tensor(TN_CLASS_EMBD, false); - vision_model.class_embedding = get_tensor(TN_CLASS_EMBD, false); + model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, prefix, "weight"), false); + model.pre_ln_b = get_tensor(string_format(TN_LN_PRE, prefix, "bias"), false); - vision_model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, "v", "weight"), false); - vision_model.pre_ln_b = get_tensor(string_format(TN_LN_PRE, "v", "bias"), false); + model.post_ln_w = get_tensor(string_format(TN_LN_POST, prefix, "weight"), false); + model.post_ln_b = get_tensor(string_format(TN_LN_POST, prefix, "bias"), false); - vision_model.post_ln_w = get_tensor(string_format(TN_LN_POST, "v", "weight"), false); - vision_model.post_ln_b = get_tensor(string_format(TN_LN_POST, "v", "bias"), false); + model.patch_bias = get_tensor(TN_PATCH_BIAS, false); + model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false); + model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false); - vision_model.patch_bias = get_tensor(TN_PATCH_BIAS, false); - vision_model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false); - vision_model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false); - - vision_model.position_embeddings = get_tensor(TN_POS_EMBD, false); + model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false); // layers - vision_model.layers.resize(hparams.n_layer); + model.layers.resize(hparams.n_layer); for (int il = 0; il < hparams.n_layer; ++il) { - auto & layer = vision_model.layers[il]; - layer.k_w = get_tensor(string_format(TN_ATTN_K, "v", il, "weight")); - layer.q_w = get_tensor(string_format(TN_ATTN_Q, "v", il, "weight")); - layer.v_w = get_tensor(string_format(TN_ATTN_V, "v", il, "weight")); - layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "weight")); - layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, "v", il, "weight"), false); - layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, "v", il, "weight"), false); - layer.ln_1_w = get_tensor(string_format(TN_LN_1, "v", il, "weight"), false); - layer.ln_2_w = get_tensor(string_format(TN_LN_2, "v", il, "weight"), false); - layer.ls_1_w = get_tensor(string_format(TN_LS_1, "v", il, "weight"), false); // no bias - layer.ls_2_w = get_tensor(string_format(TN_LS_2, "v", il, "weight"), false); // no bias + auto & layer = model.layers[il]; + layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight")); + layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight")); + layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight")); + layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "weight")); + layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, prefix, il, "weight"), false); + layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, prefix, il, "weight"), false); + layer.ln_1_w = get_tensor(string_format(TN_LN_1, prefix, il, "weight"), false); + layer.ln_2_w = get_tensor(string_format(TN_LN_2, prefix, il, "weight"), false); + layer.ls_1_w = get_tensor(string_format(TN_LS_1, prefix, il, "weight"), false); // no bias + layer.ls_2_w = get_tensor(string_format(TN_LS_2, prefix, il, "weight"), false); // no bias - layer.k_b = get_tensor(string_format(TN_ATTN_K, "v", il, "bias"), false); - layer.q_b = 
get_tensor(string_format(TN_ATTN_Q, "v", il, "bias"), false); - layer.v_b = get_tensor(string_format(TN_ATTN_V, "v", il, "bias"), false); - layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "bias"), false); - layer.ln_1_b = get_tensor(string_format(TN_LN_1, "v", il, "bias"), false); - layer.ln_2_b = get_tensor(string_format(TN_LN_2, "v", il, "bias"), false); + layer.k_b = get_tensor(string_format(TN_ATTN_K, prefix, il, "bias"), false); + layer.q_b = get_tensor(string_format(TN_ATTN_Q, prefix, il, "bias"), false); + layer.v_b = get_tensor(string_format(TN_ATTN_V, prefix, il, "bias"), false); + layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "bias"), false); + layer.ln_1_b = get_tensor(string_format(TN_LN_1, prefix, il, "bias"), false); + layer.ln_2_b = get_tensor(string_format(TN_LN_2, prefix, il, "bias"), false); // ffn - layer.ff_up_w = get_tensor(string_format(TN_FFN_UP, "v", il, "weight")); - layer.ff_up_b = get_tensor(string_format(TN_FFN_UP, "v", il, "bias"), false); - layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, "v", il, "weight"), false); - layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE, "v", il, "bias"), false); - layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, "v", il, "weight")); - layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, "v", il, "bias"), false); + layer.ff_up_w = get_tensor(string_format(TN_FFN_UP, prefix, il, "weight")); + layer.ff_up_b = get_tensor(string_format(TN_FFN_UP, prefix, il, "bias"), false); + layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, prefix, il, "weight"), false); + layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE, prefix, il, "bias"), false); + layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "weight")); + layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "bias"), false); // some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here // note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check! 
@@ -2175,146 +2383,166 @@ struct clip_model_loader { } } - switch (ctx_clip.proj_type) { + switch (model.proj_type) { case PROJECTOR_TYPE_MLP: case PROJECTOR_TYPE_MLP_NORM: { // LLaVA projection - vision_model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"), false); - vision_model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false); + model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"), false); + model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false); // Yi-type llava - vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"), false); - vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false); + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"), false); + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false); // missing in Yi-type llava - vision_model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"), false); - vision_model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false); + model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"), false); + model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false); // Yi-type llava - vision_model.mm_3_w = get_tensor(string_format(TN_LLAVA_PROJ, 3, "weight"), false); - vision_model.mm_3_b = get_tensor(string_format(TN_LLAVA_PROJ, 3, "bias"), false); - vision_model.mm_4_w = get_tensor(string_format(TN_LLAVA_PROJ, 4, "weight"), false); - vision_model.mm_4_b = get_tensor(string_format(TN_LLAVA_PROJ, 4, "bias"), false); - if (vision_model.mm_3_w) { + model.mm_3_w = get_tensor(string_format(TN_LLAVA_PROJ, 3, "weight"), false); + model.mm_3_b = get_tensor(string_format(TN_LLAVA_PROJ, 3, "bias"), false); + model.mm_4_w = get_tensor(string_format(TN_LLAVA_PROJ, 4, "weight"), false); + model.mm_4_b = get_tensor(string_format(TN_LLAVA_PROJ, 4, "bias"), false); + if (model.mm_3_w) { // TODO: this is a hack to support Yi-type llava - ctx_clip.proj_type = PROJECTOR_TYPE_MLP_NORM; + model.proj_type = PROJECTOR_TYPE_MLP_NORM; } - vision_model.image_newline = get_tensor(TN_IMAGE_NEWLINE, false); + model.image_newline = get_tensor(TN_IMAGE_NEWLINE, false); } break; case PROJECTOR_TYPE_LDP: { // MobileVLM projection - vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight")); - vision_model.mm_model_mlp_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias")); - vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight")); - vision_model.mm_model_mlp_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias")); - vision_model.mm_model_block_1_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight")); - vision_model.mm_model_block_1_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight")); - vision_model.mm_model_block_1_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias")); - vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight")); - vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias")); - vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight")); - vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias")); - vision_model.mm_model_block_1_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight")); - 
vision_model.mm_model_block_1_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight")); - vision_model.mm_model_block_1_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias")); - vision_model.mm_model_block_2_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight")); - vision_model.mm_model_block_2_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight")); - vision_model.mm_model_block_2_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias")); - vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight")); - vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias")); - vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight")); - vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias")); - vision_model.mm_model_block_2_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight")); - vision_model.mm_model_block_2_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight")); - vision_model.mm_model_block_2_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias")); + model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight")); + model.mm_model_mlp_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias")); + model.mm_model_mlp_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight")); + model.mm_model_mlp_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias")); + model.mm_model_block_1_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight")); + model.mm_model_block_1_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight")); + model.mm_model_block_1_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias")); + model.mm_model_block_1_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight")); + model.mm_model_block_1_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias")); + model.mm_model_block_1_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight")); + model.mm_model_block_1_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias")); + model.mm_model_block_1_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight")); + model.mm_model_block_1_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight")); + model.mm_model_block_1_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias")); + model.mm_model_block_2_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight")); + model.mm_model_block_2_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight")); + model.mm_model_block_2_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias")); + model.mm_model_block_2_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight")); + model.mm_model_block_2_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias")); + model.mm_model_block_2_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight")); + model.mm_model_block_2_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias")); + model.mm_model_block_2_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, 
"0.weight")); + model.mm_model_block_2_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight")); + model.mm_model_block_2_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias")); } break; case PROJECTOR_TYPE_LDPV2: { // MobilVLM_V2 projection - vision_model.mm_model_mlp_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight")); - vision_model.mm_model_mlp_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias")); - vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight")); - vision_model.mm_model_mlp_2_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "bias")); - vision_model.mm_model_peg_0_w = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "weight")); - vision_model.mm_model_peg_0_b = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "bias")); + model.mm_model_mlp_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight")); + model.mm_model_mlp_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias")); + model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight")); + model.mm_model_mlp_2_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "bias")); + model.mm_model_peg_0_w = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "weight")); + model.mm_model_peg_0_b = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "bias")); } break; case PROJECTOR_TYPE_MINICPMV: { - // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD); - vision_model.mm_model_pos_embed_k = get_tensor(TN_MINICPMV_POS_EMBD_K); - vision_model.mm_model_query = get_tensor(TN_MINICPMV_QUERY); - vision_model.mm_model_proj = get_tensor(TN_MINICPMV_PROJ); - vision_model.mm_model_kv_proj = get_tensor(TN_MINICPMV_KV_PROJ); - vision_model.mm_model_attn_q_w = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "weight")); - vision_model.mm_model_attn_k_w = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "weight")); - vision_model.mm_model_attn_v_w = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "weight")); - vision_model.mm_model_attn_q_b = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "bias")); - vision_model.mm_model_attn_k_b = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "bias")); - vision_model.mm_model_attn_v_b = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "bias")); - vision_model.mm_model_attn_o_w = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "weight")); - vision_model.mm_model_attn_o_b = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "bias")); - vision_model.mm_model_ln_q_w = get_tensor(string_format(TN_MINICPMV_LN, "q", "weight")); - vision_model.mm_model_ln_q_b = get_tensor(string_format(TN_MINICPMV_LN, "q", "bias")); - vision_model.mm_model_ln_kv_w = get_tensor(string_format(TN_MINICPMV_LN, "kv", "weight")); - vision_model.mm_model_ln_kv_b = get_tensor(string_format(TN_MINICPMV_LN, "kv", "bias")); - vision_model.mm_model_ln_post_w = get_tensor(string_format(TN_MINICPMV_LN, "post", "weight")); - vision_model.mm_model_ln_post_b = get_tensor(string_format(TN_MINICPMV_LN, "post", "bias")); + // model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD); + model.mm_model_pos_embed_k = get_tensor(TN_MINICPMV_POS_EMBD_K); + model.mm_model_query = get_tensor(TN_MINICPMV_QUERY); + model.mm_model_proj = get_tensor(TN_MINICPMV_PROJ); + model.mm_model_kv_proj = get_tensor(TN_MINICPMV_KV_PROJ); + model.mm_model_attn_q_w = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "weight")); + model.mm_model_attn_k_w = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "weight")); + 
model.mm_model_attn_v_w = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "weight")); + model.mm_model_attn_q_b = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "bias")); + model.mm_model_attn_k_b = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "bias")); + model.mm_model_attn_v_b = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "bias")); + model.mm_model_attn_o_w = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "weight")); + model.mm_model_attn_o_b = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "bias")); + model.mm_model_ln_q_w = get_tensor(string_format(TN_MINICPMV_LN, "q", "weight")); + model.mm_model_ln_q_b = get_tensor(string_format(TN_MINICPMV_LN, "q", "bias")); + model.mm_model_ln_kv_w = get_tensor(string_format(TN_MINICPMV_LN, "kv", "weight")); + model.mm_model_ln_kv_b = get_tensor(string_format(TN_MINICPMV_LN, "kv", "bias")); + model.mm_model_ln_post_w = get_tensor(string_format(TN_MINICPMV_LN, "post", "weight")); + model.mm_model_ln_post_b = get_tensor(string_format(TN_MINICPMV_LN, "post", "bias")); } break; case PROJECTOR_TYPE_GLM_EDGE: { - vision_model.mm_model_adapter_conv_w = get_tensor(string_format(TN_GLM_ADAPER_CONV, "weight")); - vision_model.mm_model_adapter_conv_b = get_tensor(string_format(TN_GLM_ADAPER_CONV, "bias")); - vision_model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR, "weight")); - vision_model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "weight")); - vision_model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "bias")); - vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight")); - vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight")); - vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight")); - vision_model.mm_glm_tok_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight")); - vision_model.mm_glm_tok_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight")); + model.mm_model_adapter_conv_w = get_tensor(string_format(TN_GLM_ADAPER_CONV, "weight")); + model.mm_model_adapter_conv_b = get_tensor(string_format(TN_GLM_ADAPER_CONV, "bias")); + model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR, "weight")); + model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "weight")); + model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "bias")); + model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight")); + model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight")); + model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight")); + model.mm_glm_tok_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight")); + model.mm_glm_tok_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight")); } break; case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: { - vision_model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); - vision_model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias")); - vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); - vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); + model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); + model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias")); + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); } break; case 
PROJECTOR_TYPE_GEMMA3: { - vision_model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); - vision_model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N); + model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); + model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N); } break; case PROJECTOR_TYPE_IDEFICS3: { - vision_model.projection = get_tensor(TN_MM_PROJECTOR); + model.projection = get_tensor(TN_MM_PROJECTOR); } break; case PROJECTOR_TYPE_PIXTRAL: { - vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); - vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false); - vision_model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); - vision_model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false); + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false); + model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false); // [IMG_BREAK] token embedding - vision_model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK); + model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK); // for mistral small 3.1 - vision_model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false); - vision_model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false); + model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false); + model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false); + } break; + case PROJECTOR_TYPE_ULTRAVOX: + { + model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight")); + model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias")); + model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight")); + model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias")); + model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight")); + model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight")); + model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight")); + model.mm_norm_mid_w = get_tensor(string_format(TN_MM_NORM_MID, "weight")); + } break; + case PROJECTOR_TYPE_QWEN2A: + { + model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight")); + model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias")); + model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight")); + model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias")); + model.mm_fc_w = get_tensor(string_format(TN_MM_AUDIO_FC, "weight")); + model.mm_fc_b = get_tensor(string_format(TN_MM_AUDIO_FC, "bias")); } break; case PROJECTOR_TYPE_INTERNVL: { - vision_model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight")); - vision_model.mm_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias")); - vision_model.mm_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight")); - vision_model.mm_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias")); - vision_model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight")); - vision_model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias")); + model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight")); + model.mm_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias")); + model.mm_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias")); + model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight")); + model.mm_3_b = 
get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias")); } break; case PROJECTOR_TYPE_LLAMA4: { - vision_model.mm_model_proj = get_tensor(TN_MM_PROJECTOR); - vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight")); - vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight")); + model.mm_model_proj = get_tensor(TN_MM_PROJECTOR); + model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight")); + model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight")); } break; default: GGML_ASSERT(false && "unknown projector type"); @@ -2357,15 +2585,20 @@ struct clip_model_loader { } } - void alloc_compute_meta() { + void alloc_compute_meta(clip_ctx & ctx_clip) { + const auto & hparams = ctx_clip.model.hparams; ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead()); // create a fake batch clip_image_f32_batch batch; clip_image_f32_ptr img(clip_image_f32_init()); - img->nx = ctx_clip.vision_model.hparams.warmup_image_size; - img->ny = ctx_clip.vision_model.hparams.warmup_image_size; - img->buf.resize(img->nx * img->ny * 3); + if (ctx_clip.model.modality == CLIP_MODALITY_VISION) { + img->nx = hparams.warmup_image_size; + img->ny = hparams.warmup_image_size; + } else { + img->nx = hparams.warmup_audio_size; + img->ny = hparams.n_mel_bins; + } batch.entries.push_back(std::move(img)); ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch); @@ -2443,23 +2676,40 @@ struct clip_model_loader { } }; -struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) { +struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params) { g_logger_state.verbosity_thold = ctx_params.verbosity; - clip_ctx * ctx_clip = nullptr; + clip_ctx * ctx_vision = nullptr; + clip_ctx * ctx_audio = nullptr; try { - ctx_clip = new clip_ctx(ctx_params); - clip_model_loader loader(fname, *ctx_clip); - loader.load_hparams(); - loader.load_tensors(); - loader.alloc_compute_meta(); + clip_model_loader loader(fname); + + if (loader.has_vision) { + ctx_vision = new clip_ctx(ctx_params); + loader.load_hparams(ctx_vision->model, CLIP_MODALITY_VISION); + loader.load_tensors(*ctx_vision); + loader.alloc_compute_meta(*ctx_vision); + } + + if (loader.has_audio) { + ctx_audio = new clip_ctx(ctx_params); + loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO); + loader.load_tensors(*ctx_audio); + loader.alloc_compute_meta(*ctx_audio); + } + } catch (const std::exception & e) { LOG_ERR("%s: failed to load model '%s': %s\n", __func__, fname, e.what()); - delete ctx_clip; - return nullptr; + if (ctx_vision) { + delete ctx_vision; + } + if (ctx_audio) { + delete ctx_audio; + } + return {nullptr, nullptr}; } - return ctx_clip; + return {ctx_vision, ctx_audio}; } struct clip_image_size * clip_image_size_init() { @@ -2533,30 +2783,6 @@ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny memcpy(img->buf.data(), rgb_pixels, img->buf.size()); } -bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) { - int nx, ny, nc; - auto * data = stbi_load(fname, &nx, &ny, &nc, 3); - if (!data) { - LOG_ERR("%s: failed to load image '%s'\n", __func__, fname); - return false; - } - clip_build_img_from_pixels(data, nx, ny, img); - stbi_image_free(data); - return true; -} - -bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) { - int nx, ny, nc; - auto * data = 
stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3); - if (!data) { - LOG_ERR("%s: failed to decode image bytes\n", __func__); - return false; - } - clip_build_img_from_pixels(data, nx, ny, img); - stbi_image_free(data); - return true; -} - // Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) { dst.nx = src.nx; @@ -2820,12 +3046,12 @@ struct llava_uhd { const float ratio = (float)original_width * original_height / (slice_size * slice_size); const int multiple = fmin(ceil(ratio), max_slice_nums); const bool has_slices = (multiple > 1); - const bool has_pinpoints = !ctx->vision_model.hparams.image_grid_pinpoints.empty(); + const bool has_pinpoints = !ctx->model.hparams.image_grid_pinpoints.empty(); if (has_pinpoints) { // has pinpoints, use them to calculate the grid size (e.g. llava-1.6) auto refine_size = llava_uhd::select_best_resolution( - ctx->vision_model.hparams.image_grid_pinpoints, + ctx->model.hparams.image_grid_pinpoints, original_size); res.overview_size = clip_image_size{slice_size, slice_size}; res.refined_size = refine_size; @@ -3047,7 +3273,7 @@ private: bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) { clip_image_size original_size{img->nx, img->ny}; bool pad_to_square = true; - auto & params = ctx->vision_model.hparams; + auto & params = ctx->model.hparams; // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing if (params.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD) { pad_to_square = false; @@ -3060,7 +3286,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str for (size_t i = 0; i < imgs.size(); ++i) { // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std); + normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); res_imgs->entries.push_back(std::move(res)); } @@ -3068,7 +3294,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str res_imgs->grid_y = inst.grid_size.height; return true; - } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) { + } else if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL) { clip_image_u8 resized; auto patch_size = params.patch_size * 2; auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, patch_size, params.image_size); @@ -3076,42 +3302,42 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str clip_image_f32_ptr img_f32(clip_image_f32_init()); // clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(resized, *img_f32, ctx->image_mean, ctx->image_std); + normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std); // res_imgs->data[0] = *res; res_imgs->entries.push_back(std::move(img_f32)); return true; } - else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE - || ctx->proj_type == PROJECTOR_TYPE_GEMMA3 - || ctx->proj_type == PROJECTOR_TYPE_IDEFICS3 - || ctx->proj_type == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution + 
else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE + || ctx->proj_type() == PROJECTOR_TYPE_GEMMA3 + || ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3 + || ctx->proj_type() == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution ) { clip_image_u8 resized_image; int sz = params.image_size; image_manipulation::resize_and_pad_image(*img, resized_image, {sz, sz}); clip_image_f32_ptr img_f32(clip_image_f32_init()); //clip_image_save_to_bmp(resized_image, "resized.bmp"); - normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std); + normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); res_imgs->entries.push_back(std::move(img_f32)); return true; - } else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) { + } else if (ctx->proj_type() == PROJECTOR_TYPE_PIXTRAL) { clip_image_u8 resized_image; auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size); image_manipulation::bilinear_resize(*img, resized_image, new_size.width, new_size.height); clip_image_f32_ptr img_f32(clip_image_f32_init()); - normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std); + normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); res_imgs->entries.push_back(std::move(img_f32)); return true; - } else if (ctx->proj_type == PROJECTOR_TYPE_LLAMA4) { + } else if (ctx->proj_type() == PROJECTOR_TYPE_LLAMA4) { GGML_ASSERT(!params.image_grid_pinpoints.empty()); auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); std::vector imgs = llava_uhd::slice_image(img, inst); for (size_t i = 0; i < imgs.size(); ++i) { clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std); + normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); res_imgs->entries.push_back(std::move(res)); } @@ -3141,7 +3367,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str image_manipulation::resize_and_pad_image(*img, *temp, clip_image_size{params.image_size, params.image_size}, pad_color); clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std); + normalize_image_u8_to_f32(*temp, *res, params.image_mean, params.image_std); res_imgs->entries.push_back(std::move(res)); return true; @@ -3153,7 +3379,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str for (size_t i = 0; i < imgs.size(); ++i) { // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); clip_image_f32_ptr res(clip_image_f32_init()); - normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std); + normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); res_imgs->entries.push_back(std::move(res)); } @@ -3165,7 +3391,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str } ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) { - return ctx->vision_model.image_newline; + return ctx->model.image_newline; } void clip_free(clip_ctx * ctx) { @@ -3177,8 +3403,8 @@ void clip_free(clip_ctx * ctx) { // deprecated size_t clip_embd_nbytes(const struct clip_ctx * ctx) { - const int32_t nx = ctx->vision_model.hparams.image_size; - const int32_t ny = ctx->vision_model.hparams.image_size; + const int32_t nx = ctx->model.hparams.image_size; + const int32_t ny = ctx->model.hparams.image_size; return 
clip_embd_nbytes_by_img(ctx, nx, ny); } @@ -3190,97 +3416,135 @@ size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h } int32_t clip_get_image_size(const struct clip_ctx * ctx) { - return ctx->vision_model.hparams.image_size; + return ctx->model.hparams.image_size; } int32_t clip_get_patch_size(const struct clip_ctx * ctx) { - return ctx->vision_model.hparams.patch_size; + return ctx->model.hparams.patch_size; } int32_t clip_get_hidden_size(const struct clip_ctx * ctx) { - return ctx->vision_model.hparams.n_embd; + return ctx->model.hparams.n_embd; } const char * clip_patch_merge_type(const struct clip_ctx * ctx) { - return ctx->vision_model.hparams.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD ? "spatial_unpad" : "flat"; + return ctx->model.hparams.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD ? "spatial_unpad" : "flat"; } const int32_t * clip_image_grid(const struct clip_ctx * ctx) { - if (ctx->vision_model.hparams.image_grid_pinpoints.size()) { - return &ctx->vision_model.hparams.image_grid_pinpoints.front(); + if (ctx->model.hparams.image_grid_pinpoints.size()) { + return &ctx->model.hparams.image_grid_pinpoints.front(); } return nullptr; } size_t get_clip_image_grid_size(const struct clip_ctx * ctx) { - return ctx->vision_model.hparams.image_grid_pinpoints.size(); + return ctx->model.hparams.image_grid_pinpoints.size(); } int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) { - const auto & params = ctx->vision_model.hparams; + const auto & params = ctx->model.hparams; const int n_total = clip_n_output_tokens(ctx, img); - if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) { + if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL) { return img->nx / (params.patch_size * 2) + (int)(img->nx % params.patch_size > 0); } return n_total; } int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) { - const auto & params = ctx->vision_model.hparams; - if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) { + const auto & params = ctx->model.hparams; + if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL) { return img->ny / (params.patch_size * 2) + (int)(img->ny % params.patch_size > 0); } return 1; } int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) { - const auto & params = ctx->vision_model.hparams; + const auto & params = ctx->model.hparams; - int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); - int scale_factor = ctx->vision_model.hparams.proj_scale_factor; + // only for models using fixed size square images + int n_patches_sq = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); - if (ctx->proj_type == PROJECTOR_TYPE_LDP - || ctx->proj_type == PROJECTOR_TYPE_LDPV2 - || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { - n_patches /= 4; - if (ctx->vision_model.mm_glm_tok_boi) { - n_patches += 2; // for BOI and EOI token embeddings - } - } else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { - if (ctx->minicpmv_version == 2) { - n_patches = 96; - } - else if (ctx->minicpmv_version == 3) { - n_patches = 64; - } - else if (ctx->minicpmv_version == 4) { - n_patches = 64; - } - else { - GGML_ABORT("Unknown minicpmv version"); - } - } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) { - int 
patch_size = params.patch_size * 2; - int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0); - int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0); - n_patches = x_patch * y_patch; - } else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { - int n_per_side = params.image_size / params.patch_size; - int n_per_side_2d_pool = n_per_side / params.proj_scale_factor; - n_patches = n_per_side_2d_pool * n_per_side_2d_pool; - } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3 || ctx->proj_type == PROJECTOR_TYPE_INTERNVL) { - // both W and H are divided by proj_scale_factor - n_patches /= (params.proj_scale_factor * params.proj_scale_factor); - } else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) { - int n_merge = params.spatial_merge_size; - int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1); - int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? n_merge : 1); - n_patches = n_patches_y*n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row - } else if (ctx->proj_type == PROJECTOR_TYPE_LLAMA4) { - n_patches /= (scale_factor * scale_factor); + projector_type proj = ctx->proj_type(); + + switch (proj) { + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_MLP_NORM: + { + // do nothing + } break; + case PROJECTOR_TYPE_LDP: + case PROJECTOR_TYPE_LDPV2: + case PROJECTOR_TYPE_GLM_EDGE: + { + n_patches_sq /= 4; + if (ctx->model.mm_glm_tok_boi) { + n_patches_sq += 2; // for BOI and EOI token embeddings + } + } break; + case PROJECTOR_TYPE_MINICPMV: + { + if (params.minicpmv_version == 2) { + n_patches_sq = 96; + } else if (params.minicpmv_version == 3) { + n_patches_sq = 64; + } else if (params.minicpmv_version == 4) { + n_patches_sq = 64; + } else { + GGML_ABORT("Unknown minicpmv version"); + } + } break; + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + { + // dynamic size + int patch_size = params.patch_size * 2; + int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0); + int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0); + n_patches_sq = x_patch * y_patch; + } break; + case PROJECTOR_TYPE_GEMMA3: + { + int n_per_side = params.image_size / params.patch_size; + int n_per_side_2d_pool = n_per_side / params.proj_scale_factor; + n_patches_sq = n_per_side_2d_pool * n_per_side_2d_pool; + } break; + case PROJECTOR_TYPE_IDEFICS3: + case PROJECTOR_TYPE_INTERNVL: + { + // both W and H are divided by proj_scale_factor + n_patches_sq /= (params.proj_scale_factor * params.proj_scale_factor); + } break; + case PROJECTOR_TYPE_PIXTRAL: + { + // dynamic size + int n_merge = params.spatial_merge_size; + int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1); + int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? 
n_merge : 1); + n_patches_sq = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row + } break; + case PROJECTOR_TYPE_LLAMA4: + { + int scale_factor = ctx->model.hparams.proj_scale_factor; + n_patches_sq /= (scale_factor * scale_factor); + } break; + case PROJECTOR_TYPE_ULTRAVOX: + { + const int proj_stack_factor = ctx->model.hparams.proj_stack_factor; + const int n_len = CLIP_ALIGN(img->nx, proj_stack_factor); + n_patches_sq = n_len / proj_stack_factor / 2; + } break; + case PROJECTOR_TYPE_QWEN2A: + { + // divide by 2 because of whisper + // another divide by 2 because of nn.AvgPool1d(2, stride=2) + n_patches_sq = img->nx / 4; + } break; + default: + GGML_ABORT("unsupported projector type"); } - return n_patches; + return n_patches_sq; } static std::vector>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector> & pos) { @@ -3395,7 +3659,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ggml_backend_sched_alloc_graph(ctx->sched.get(), gf); // set inputs - const auto & model = ctx->vision_model; + const auto & model = ctx->model; const auto & hparams = model.hparams; const int image_size_width = imgs.entries[0]->nx; @@ -3435,7 +3699,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima }; // set input pixel values - { + if (!imgs.is_audio) { size_t nelem = 0; for (const auto & img : imgs.entries) { nelem += img->nx * img->ny * 3; @@ -3472,10 +3736,20 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } } set_input_f32("inp_raw", inp_raw); + + } else { + // audio input + GGML_ASSERT(imgs.entries.size() == 1); + const auto & mel_inp = imgs.entries[0]; + const int n_step = mel_inp->nx; + const int n_mel = mel_inp->ny; + std::vector inp_raw(n_step * n_mel); + std::memcpy(inp_raw.data(), mel_inp->buf.data(), n_step * n_mel * sizeof(float)); + set_input_f32("inp_raw", inp_raw); } // set input per projector - switch (ctx->proj_type) { + switch (ctx->model.proj_type) { case PROJECTOR_TYPE_MINICPMV: { // inspired from siglip: @@ -3668,6 +3942,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_INTERNVL: + case PROJECTOR_TYPE_QWEN2A: + case PROJECTOR_TYPE_ULTRAVOX: { // do nothing } break; @@ -3727,7 +4003,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima const int n_tokens_out = embeddings->ne[1]; const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get()); if (n_tokens_out != expected_n_tokens_out) { - LOG_ERR("%s: expected %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out); + LOG_ERR("%s: expected output %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out); GGML_ABORT("Invalid number of output tokens"); } @@ -3738,64 +4014,83 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } int clip_n_mmproj_embd(const struct clip_ctx * ctx) { - switch (ctx->proj_type) { + const auto & hparams = ctx->model.hparams; + switch (ctx->model.proj_type) { case PROJECTOR_TYPE_LDP: - return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]; + return ctx->model.mm_model_block_1_block_2_1_b->ne[0]; case PROJECTOR_TYPE_LDPV2: - return ctx->vision_model.mm_model_peg_0_b->ne[0]; + return ctx->model.mm_model_peg_0_b->ne[0]; case PROJECTOR_TYPE_MLP: case PROJECTOR_TYPE_PIXTRAL: - return ctx->vision_model.mm_2_w->ne[1]; + return 
ctx->model.mm_2_w->ne[1]; case PROJECTOR_TYPE_MLP_NORM: - return ctx->vision_model.mm_3_b->ne[0]; + return ctx->model.mm_3_b->ne[0]; case PROJECTOR_TYPE_MINICPMV: - if (ctx->minicpmv_version == 2) { + if (hparams.minicpmv_version == 2) { return 4096; - } else if (ctx->minicpmv_version == 3) { + } else if (hparams.minicpmv_version == 3) { return 3584; - } else if (ctx->minicpmv_version == 4) { + } else if (hparams.minicpmv_version == 4) { return 3584; } GGML_ABORT("Unknown minicpmv version"); case PROJECTOR_TYPE_GLM_EDGE: - return ctx->vision_model.mm_model_mlp_3_w->ne[1]; + return ctx->model.mm_model_mlp_3_w->ne[1]; case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: - return ctx->vision_model.mm_1_b->ne[0]; + return ctx->model.mm_1_b->ne[0]; case PROJECTOR_TYPE_GEMMA3: - return ctx->vision_model.mm_input_proj_w->ne[0]; + return ctx->model.mm_input_proj_w->ne[0]; case PROJECTOR_TYPE_IDEFICS3: - return ctx->vision_model.projection->ne[1]; + return ctx->model.projection->ne[1]; + case PROJECTOR_TYPE_ULTRAVOX: + return ctx->model.mm_2_w->ne[1]; case PROJECTOR_TYPE_INTERNVL: - return ctx->vision_model.mm_3_w->ne[1]; + return ctx->model.mm_3_w->ne[1]; case PROJECTOR_TYPE_LLAMA4: - return ctx->vision_model.mm_model_proj->ne[1]; + return ctx->model.mm_model_proj->ne[1]; + case PROJECTOR_TYPE_QWEN2A: + return ctx->model.mm_fc_w->ne[1]; default: GGML_ABORT("Unknown projector type"); } } int clip_is_minicpmv(const struct clip_ctx * ctx) { - if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { - return ctx->minicpmv_version; + if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) { + return ctx->model.hparams.minicpmv_version; } return 0; } bool clip_is_glm(const struct clip_ctx * ctx) { - return ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE; + return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE; } bool clip_is_qwen2vl(const struct clip_ctx * ctx) { - return ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL; + return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL + || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL; } bool clip_is_llava(const struct clip_ctx * ctx) { - return ctx->has_llava_projector; + return ctx->model.hparams.has_llava_projector; } bool clip_is_gemma3(const struct clip_ctx * ctx) { - return ctx->proj_type == PROJECTOR_TYPE_GEMMA3; + return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3; +} + +bool clip_has_vision_encoder(const struct clip_ctx * ctx) { + return ctx->model.modality == CLIP_MODALITY_VISION; +} + +bool clip_has_audio_encoder(const struct clip_ctx * ctx) { + return ctx->model.modality == CLIP_MODALITY_AUDIO; +} + +bool clip_has_whisper_encoder(const struct clip_ctx * ctx) { + return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX + || ctx->proj_type() == PROJECTOR_TYPE_QWEN2A; } bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) { @@ -3816,5 +4111,16 @@ bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, // projector_type clip_get_projector_type(const struct clip_ctx * ctx) { - return ctx->proj_type; + return ctx->proj_type(); +} + +void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel) { + clip_image_f32 * audio = new clip_image_f32; + audio->nx = n_frames; + audio->ny = n_mel; + audio->buf.resize(n_frames * n_mel); + std::memcpy(audio->buf.data(), mel, n_frames * n_mel * sizeof(float)); + + batch->entries.push_back(clip_image_f32_ptr(audio)); + batch->is_audio = true; } diff --git a/tools/mtmd/clip.h 
b/tools/mtmd/clip.h index e7a1c0782..cb2eb261f 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -4,6 +4,8 @@ #include #include +// !!! Internal header, to be used by mtmd only !!! + struct clip_ctx; struct clip_image_size { @@ -15,12 +17,22 @@ struct clip_image_f32; struct clip_image_u8_batch; struct clip_image_f32_batch; +enum clip_modality { + CLIP_MODALITY_VISION, + CLIP_MODALITY_AUDIO, +}; + struct clip_context_params { bool use_gpu; enum ggml_log_level verbosity; }; -struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params); +struct clip_init_result { + struct clip_ctx * ctx_v; // vision context + struct clip_ctx * ctx_a; // audio context +}; + +struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params); void clip_free(struct clip_ctx * ctx); @@ -93,3 +105,10 @@ bool clip_is_llava(const struct clip_ctx * ctx); bool clip_is_gemma3(const struct clip_ctx * ctx); bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec); + +// use by audio input +void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel); + +bool clip_has_vision_encoder(const struct clip_ctx * ctx); +bool clip_has_audio_encoder(const struct clip_ctx * ctx); +bool clip_has_whisper_encoder(const struct clip_ctx * ctx); diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp new file mode 100644 index 000000000..4d053895c --- /dev/null +++ b/tools/mtmd/mtmd-audio.cpp @@ -0,0 +1,769 @@ +#include "mtmd-audio.h" + +#define _USE_MATH_DEFINES // for M_PI +#include +#include +#include +#include +#include +#include +#include + +// most of the code here is copied from whisper.cpp + +// align x to upper multiple of n +#define _ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n)) + +namespace whisper_preprocessor { + +#define SIN_COS_N_COUNT WHISPER_N_FFT +namespace { +struct whisper_global_cache { + // In FFT, we frequently use sine and cosine operations with the same values. + // We can use precalculated values to speed up the process. 
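For orientation, a caller-side sketch of the split-context API declared in clip.h above: clip_init() now returns a clip_init_result holding an optional vision context and an optional audio context, each loaded, used and freed independently. The file name and control flow below are hypothetical; mtmd is the intended consumer.

#include "clip.h"

// Hypothetical caller: load one mmproj GGUF and keep whichever encoders it provides.
static bool load_encoders_sketch() {
    clip_context_params cparams;
    cparams.use_gpu   = true;
    cparams.verbosity = GGML_LOG_LEVEL_INFO;

    clip_init_result res = clip_init("mmproj-model.gguf", cparams);
    if (!res.ctx_v && !res.ctx_a) {
        return false; // clip_init returns {nullptr, nullptr} on failure
    }
    if (res.ctx_v) {
        // vision encoder present: preprocess images and call clip_image_batch_encode()
    }
    if (res.ctx_a) {
        // audio encoder present: fill batches with clip_image_f32_batch_add_mel()
    }
    if (res.ctx_v) { clip_free(res.ctx_v); }
    if (res.ctx_a) { clip_free(res.ctx_a); }
    return true;
}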
+ float sin_vals[SIN_COS_N_COUNT]; + float cos_vals[SIN_COS_N_COUNT]; + + // Hann window (Use cosf to eliminate difference) + // ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html + // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147 + float hann_window[WHISPER_N_FFT]; + + whisper_global_cache() { + fill_sin_cos_table(); + fill_hann_window(sizeof(hann_window)/sizeof(hann_window[0]), true, hann_window); + } + + void fill_sin_cos_table() { + for (int i = 0; i < SIN_COS_N_COUNT; i++) { + double theta = (2 * M_PI * i) / SIN_COS_N_COUNT; + sin_vals[i] = sinf(theta); + cos_vals[i] = cosf(theta); + } + } + + void fill_hann_window(int length, bool periodic, float * output) { + int offset = -1; + if (periodic) { + offset = 0; + } + for (int i = 0; i < length; i++) { + output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset))); + } + } +} global_cache; +} + +// naive Discrete Fourier Transform +// input is real-valued +// output is complex-valued +static void dft(const float* in, int N, float* out) { + const int sin_cos_step = SIN_COS_N_COUNT / N; + + for (int k = 0; k < N; k++) { + float re = 0; + float im = 0; + + for (int n = 0; n < N; n++) { + int idx = (k * n * sin_cos_step) % (SIN_COS_N_COUNT); // t = 2*M_PI*k*n/N + re += in[n]*global_cache.cos_vals[idx]; // cos(t) + im -= in[n]*global_cache.sin_vals[idx]; // sin(t) + } + + out[k*2 + 0] = re; + out[k*2 + 1] = im; + } +} + +// Cooley-Tukey FFT +// poor man's implementation - use something better +// input is real-valued +// output is complex-valued +static void fft(float* in, int N, float* out) { + if (N == 1) { + out[0] = in[0]; + out[1] = 0; + return; + } + + const int half_N = N / 2; + if (N - half_N*2 == 1) { + dft(in, N, out); + return; + } + + float* even = in + N; + for (int i = 0; i < half_N; ++i) { + even[i]= in[2*i]; + } + float* even_fft = out + 2 * N; + fft(even, half_N, even_fft); + + float* odd = even; + for (int i = 0; i < half_N; ++i) { + odd[i] = in[2*i + 1]; + } + float* odd_fft = even_fft + N; + fft(odd, half_N, odd_fft); + + const int sin_cos_step = SIN_COS_N_COUNT / N; + for (int k = 0; k < half_N; k++) { + int idx = k * sin_cos_step; // t = 2*M_PI*k/N + float re = global_cache.cos_vals[idx]; // cos(t) + float im = -global_cache.sin_vals[idx]; // sin(t) + + float re_odd = odd_fft[2*k + 0]; + float im_odd = odd_fft[2*k + 1]; + + out[2*k + 0] = even_fft[2*k + 0] + re*re_odd - im*im_odd; + out[2*k + 1] = even_fft[2*k + 1] + re*im_odd + im*re_odd; + + out[2*(k + half_N) + 0] = even_fft[2*k + 0] - re*re_odd + im*im_odd; + out[2*(k + half_N) + 1] = even_fft[2*k + 1] - re*im_odd - im*re_odd; + } +} + +static void log_mel_spectrogram_worker_thread(int ith, const float * hann, const std::vector & samples, + int n_samples, int frame_size, int frame_step, int n_threads, + const whisper_filters & filters, whisper_mel & mel) { + std::vector fft_in(frame_size * 2, 0.0); + std::vector fft_out(frame_size * 2 * 2 * 2); + + int n_fft = filters.n_fft; + int i = ith; + + // make sure n_fft == 1 + (WHISPER_N_FFT / 2), bin_0 to bin_nyquist + WHISPER_ASSERT(n_fft == 1 + (frame_size / 2)); + + // calculate FFT only when fft_in are not all zero + for (; i < std::min(n_samples / frame_step + 1, mel.n_len); i += n_threads) { + const int offset = i * frame_step; + + // apply Hann window (~10% faster) + for (int j = 0; j < std::min(frame_size, n_samples - offset); j++) { + fft_in[j] = hann[j] * samples[offset + j]; + } + + // fill the rest with zeros + if (n_samples - offset < frame_size) { + 
std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0); + } + + // FFT + fft(fft_in.data(), frame_size, fft_out.data()); + + // Calculate modulus^2 of complex numbers + // Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting. + for (int j = 0; j < n_fft; j++) { + fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]); + } + + // mel spectrogram + for (int j = 0; j < mel.n_mel; j++) { + double sum = 0.0; + // unroll loop (suggested by GH user @lunixbochs) + int k = 0; + for (k = 0; k < n_fft - 3; k += 4) { + sum += + fft_out[k + 0] * filters.data[j * n_fft + k + 0] + + fft_out[k + 1] * filters.data[j * n_fft + k + 1] + + fft_out[k + 2] * filters.data[j * n_fft + k + 2] + + fft_out[k + 3] * filters.data[j * n_fft + k + 3]; + } + // handle n_fft remainder + for (; k < n_fft; k++) { + sum += fft_out[k] * filters.data[j * n_fft + k]; + } + sum = log10(std::max(sum, 1e-10)); + mel.data[j * mel.n_len + i] = sum; + } + } + + // Otherwise fft_out are all zero + double sum = log10(1e-10); + for (; i < mel.n_len; i += n_threads) { + for (int j = 0; j < mel.n_mel; j++) { + mel.data[j * mel.n_len + i] = sum; + } + } +} + +// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L110-L157 +static bool log_mel_spectrogram( + const float * samples, + const int n_samples, + const int /*sample_rate*/, + const int frame_size, + const int frame_step, + const int n_mel, + const int n_threads, + const whisper_filters & filters, + const bool debug, + whisper_mel & mel) { + //const int64_t t_start_us = ggml_time_us(); + + // Hann window + WHISPER_ASSERT(frame_size == WHISPER_N_FFT && "Unsupported frame_size"); + const float * hann = global_cache.hann_window; + + // Calculate the length of padding + int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30; + int64_t stage_2_pad = frame_size / 2; + + // Initialize a vector and copy data from C array to it. 
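To make the framing in log_mel_spectrogram below concrete, here is a small worked example assuming the usual whisper.cpp constants (16 kHz sample rate, 400-sample FFT frames, 160-sample hop); those constants come from mtmd-audio.h / whisper.cpp and are an assumption here, not part of this hunk.

// Illustrative arithmetic only: how 10 s of 16 kHz audio becomes mel frames.
// Assumed: WHISPER_SAMPLE_RATE = 16000, WHISPER_N_FFT = 400, WHISPER_HOP_LENGTH = 160.
const int n_samples   = 10 * 16000;           // 160,000 input samples
const int stage_1_pad = 16000 * 30;           // 480,000 zeros appended (30 s, as noted below)
const int stage_2_pad = 400 / 2;              // 200 extra samples on each side (reflective at the start)
const int n_padded    = n_samples + stage_1_pad + 2 * stage_2_pad;   // 640,400
const int n_len       = (n_padded - 400) / 160;                      // 4,000 frames
// preprocess_audio() later slices the result into 3000-frame (30 s) chunks for clip.cpp
// and drops the trailing, all-padding remainder (here the final 1,000 frames).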
+ std::vector samples_padded; + samples_padded.resize(n_samples + stage_1_pad + stage_2_pad * 2); + std::copy(samples, samples + n_samples, samples_padded.begin() + stage_2_pad); + + // pad 30 seconds of zeros at the end of audio (480,000 samples) + reflective pad 200 samples at the end of audio + std::fill(samples_padded.begin() + n_samples + stage_2_pad, samples_padded.begin() + n_samples + stage_1_pad + 2 * stage_2_pad, 0); + + // reflective pad 200 samples at the beginning of audio + std::reverse_copy(samples + 1, samples + 1 + stage_2_pad, samples_padded.begin()); + + mel.n_mel = n_mel; + // https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/SpectralOps.cpp#L936 + // Calculate number of frames + remove the last frame + mel.n_len = (samples_padded.size() - frame_size) / frame_step; + // Calculate semi-padded sample length to ensure compatibility + mel.n_len_org = 1 + (n_samples + stage_2_pad - frame_size) / frame_step; + mel.data.resize(mel.n_mel * mel.n_len); + + { + std::vector workers(n_threads - 1); + for (int iw = 0; iw < n_threads - 1; ++iw) { + workers[iw] = std::thread( + log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded), + n_samples + stage_2_pad, frame_size, frame_step, n_threads, + std::cref(filters), std::ref(mel)); + } + + // main thread + log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples + stage_2_pad, frame_size, frame_step, n_threads, filters, mel); + + for (int iw = 0; iw < n_threads - 1; ++iw) { + workers[iw].join(); + } + } + + // clamping and normalization + double mmax = -1e20; + for (int i = 0; i < mel.n_mel*mel.n_len; i++) { + if (mel.data[i] > mmax) { + mmax = mel.data[i]; + } + } + + mmax -= 8.0; + + for (int i = 0; i < mel.n_mel*mel.n_len; i++) { + if (mel.data[i] < mmax) { + mel.data[i] = mmax; + } + + mel.data[i] = (mel.data[i] + 4.0)/4.0; + } + + // Dump log_mel_spectrogram + if (debug) { + std::ofstream outFile("log_mel_spectrogram.json"); + outFile << "["; + for (uint64_t i = 0; i < mel.data.size() - 1; i++) { + outFile << mel.data[i] << ", "; + } + outFile << mel.data[mel.data.size() - 1] << "]"; + outFile.close(); + } + + return true; +} + +bool preprocess_audio( + const float * samples, + size_t n_samples, + const whisper_filters & filters, + std::vector & output) { + + if (n_samples == 0) { + // empty audio + return false; + } + + whisper_mel out_full; + bool ok = log_mel_spectrogram( + samples, + n_samples, + COMMON_SAMPLE_RATE, + WHISPER_N_FFT, + WHISPER_HOP_LENGTH, + filters.n_mel, + 4, // n_threads + filters, + false, // debug + out_full); + if (!ok) { + return false; + } + + // because the cgraph in clip.cpp only accepts 3000 frames each, we need to split the mel + // we always expect the mel to have 3000 silent frames at the end + // printf("n_len %d\n", out_full.n_len); + const size_t frames_per_chunk = 3000; + GGML_ASSERT((size_t)out_full.n_len > frames_per_chunk); + for (size_t off = 0; off < (size_t)out_full.n_len; off += frames_per_chunk) { + int n_len = std::min(frames_per_chunk, (size_t)out_full.n_len - off); + if ((size_t)n_len < frames_per_chunk) { + break; // last uncomplete chunk will always be a padded chunk, safe to ignore + } + + whisper_mel out_chunk; + out_chunk.n_len = n_len; + out_chunk.n_mel = out_full.n_mel; + out_chunk.n_len_org = out_full.n_mel; // unused + out_chunk.data.reserve(out_chunk.n_mel * out_chunk.n_len); + + for (int i = 0; i < out_full.n_mel; i++) { + auto src = out_full.data.begin() + i*out_full.n_len + off; + out_chunk.data.insert(out_chunk.data.end(), 
src, src + frames_per_chunk); + } + + output.push_back(std::move(out_chunk)); + } + + return true; +} + +} // namespace whisper_preprocessor + + +// precalculated mel filter banks +// values are multiplied by 1000.0 to save space, and will be divided by 1000.0 in the end of the function +// +// generated from python code: +// +// from numpy import load +// data = load('mel_filters.npz') +// lst = data.files +// for item in lst: +// print(item) +// print(data[item].shape) +// n_mel = data[item].shape[0] +// n_fft = data[item].shape[1] +// for i, row in enumerate(data[item]): +// for j, val in enumerate(row): +// val = val * 1000.0 +// if val != 0: +// print(f"data[{i*n_fft + j}] = {val:.6f};") + +namespace whisper_precalc_filters { + +whisper_preprocessor::whisper_filters get_128_bins() { + whisper_preprocessor::whisper_filters filters; + filters.n_mel = 128; + filters.n_fft = 201; + std::vector data(filters.n_mel * filters.n_fft, 0.0f); + + data[1] = 12.37398665; + data[202] = 30.39256483; + data[404] = 24.74797331; + data[605] = 18.01857911; + data[807] = 37.12195903; + data[1008] = 5.64459199; + data[1009] = 6.72939420; + data[1210] = 36.03715822; + data[1412] = 19.10337992; + data[1613] = 23.66316877; + data[1815] = 31.47736564; + data[2016] = 11.28918398; + data[2017] = 1.08480197; + data[2218] = 41.68175161; + data[2420] = 13.45878839; + data[2621] = 29.30776216; + data[2823] = 25.83277412; + data[3024] = 16.93377644; + data[3226] = 38.20675984; + data[3427] = 4.55979025; + data[3428] = 7.81419594; + data[3629] = 34.95235741; + data[3831] = 20.18818259; + data[4032] = 22.57836796; + data[4234] = 32.56217018; + data[4435] = 10.20438317; + data[4436] = 2.16960395; + data[4637] = 40.59694707; + data[4839] = 14.54358920; + data[5040] = 28.22295949; + data[5242] = 26.91757679; + data[5443] = 15.84897563; + data[5645] = 39.29156065; + data[5846] = 3.47498828; + data[5847] = 8.89899861; + data[6048] = 33.86755288; + data[6250] = 21.27298526; + data[6451] = 21.49356715; + data[6653] = 33.64697099; + data[6854] = 9.11958050; + data[6855] = 3.25440569; + data[7056] = 39.51214626; + data[7258] = 15.62839188; + data[7459] = 27.13815868; + data[7661] = 28.00237760; + data[7862] = 14.76417296; + data[8064] = 40.37636518; + data[8265] = 2.38068704; + data[8266] = 10.20263787; + data[8467] = 31.61146119; + data[8669] = 24.54700135; + data[8870] = 15.32919332; + data[8871] = 1.66583748; + data[9072] = 36.72905266; + data[9274] = 20.09709924; + data[9475] = 16.93102531; + data[9476] = 2.90265540; + data[9677] = 32.84499049; + data[9879] = 23.52004871; + data[10080] = 11.03894413; + data[10081] = 10.72582975; + data[10282] = 22.71829173; + data[10484] = 32.27872774; + data[10685] = 0.11626833; + data[10686] = 22.85348251; + data[10887] = 8.56344029; + data[10888] = 14.97978810; + data[11089] = 15.51398356; + data[11090] = 8.51490628; + data[11291] = 21.10680379; + data[11292] = 3.32652032; + data[11493] = 25.47064796; + data[11695] = 27.35907957; + data[11896] = 0.65853616; + data[11897] = 23.83812517; + data[12098] = 3.44359246; + data[12099] = 21.22455277; + data[12300] = 5.35842171; + data[12301] = 19.42555793; + data[12502] = 6.49324711; + data[12503] = 18.35542172; + data[12704] = 6.93138083; + data[12705] = 17.93504693; + data[12906] = 6.74968259; + data[12907] = 18.09151843; + data[13108] = 6.01899112; + data[13109] = 18.75767298; + data[13310] = 4.80452832; + data[13311] = 19.87172849; + data[13512] = 3.16627859; + data[13513] = 21.37690969; + data[13514] = 1.25317345; + data[13714] = 
1.15934468; + data[13715] = 20.80361731; + data[13716] = 4.04486805; + data[13917] = 17.55363122; + data[13918] = 7.08320038; + data[14119] = 14.07538634; + data[14120] = 10.32655034; + data[14321] = 10.40921453; + data[14322] = 13.73696327; + data[14523] = 6.59187697; + data[14524] = 17.27988198; + data[14525] = 1.46804214; + data[14725] = 2.65681883; + data[14726] = 18.09193194; + data[14727] = 5.85655728; + data[14928] = 13.34277913; + data[14929] = 10.28267574; + data[15130] = 8.56800377; + data[15131] = 14.72230814; + data[15132] = 1.04039861; + data[15332] = 3.79085587; + data[15333] = 17.14678481; + data[15334] = 6.11609267; + data[15535] = 11.75929047; + data[15536] = 11.13393717; + data[15737] = 6.43857848; + data[15738] = 16.07806236; + data[15739] = 4.23917221; + data[15939] = 1.19989377; + data[15940] = 12.75671553; + data[15941] = 9.65298992; + data[16142] = 7.06935255; + data[16143] = 14.94054683; + data[16144] = 4.19024844; + data[16344] = 1.51483389; + data[16345] = 12.00899947; + data[16346] = 9.84823331; + data[16547] = 6.10224018; + data[16548] = 15.33857174; + data[16549] = 5.57676842; + data[16749] = 0.36827257; + data[16750] = 9.89749376; + data[16751] = 11.35340426; + data[16752] = 2.05122307; + data[16952] = 3.89297144; + data[16953] = 12.97352277; + data[16954] = 8.06631614; + data[17155] = 6.74493238; + data[17156] = 13.85874674; + data[17157] = 5.41190524; + data[17357] = 0.74220158; + data[17358] = 8.98779090; + data[17359] = 11.37871388; + data[17360] = 3.32958088; + data[17560] = 2.82313535; + data[17561] = 10.68049297; + data[17562] = 9.43340641; + data[17563] = 1.76325557; + data[17763] = 4.39018616; + data[17764] = 11.87758986; + data[17765] = 7.97005836; + data[17766] = 0.66104700; + data[17966] = 5.49466675; + data[17967] = 12.62953598; + data[17968] = 6.93987962; + data[18169] = 6.18401915; + data[18170] = 12.93473132; + data[18171] = 6.29778765; + data[18371] = 0.02325210; + data[18372] = 6.50206627; + data[18373] = 12.32661773; + data[18374] = 6.00216538; + data[18574] = 0.31548753; + data[18575] = 6.48925547; + data[18576] = 12.04130240; + data[18577] = 6.01462880; + data[18777] = 0.29979556; + data[18778] = 6.18288014; + data[18779] = 12.04272825; + data[18780] = 6.29981188; + data[18781] = 0.55689598; + data[18980] = 0.01120471; + data[18981] = 5.61729167; + data[18982] = 11.22337859; + data[18983] = 6.82516303; + data[18984] = 1.35264499; + data[19184] = 4.82410006; + data[19185] = 10.16623247; + data[19186] = 7.56075513; + data[19187] = 2.34590308; + data[19387] = 3.83235747; + data[19388] = 8.92296247; + data[19389] = 8.47910438; + data[19390] = 3.50978645; + data[19590] = 2.66873185; + data[19591] = 7.51965167; + data[19592] = 9.55500547; + data[19593] = 4.81966138; + data[19594] = 0.08431751; + data[19793] = 1.35767367; + data[19794] = 5.98019501; + data[19795] = 10.60271543; + data[19796] = 6.25298498; + data[19797] = 1.74059917; + data[19997] = 4.32644226; + data[19998] = 8.73131864; + data[19999] = 7.78916525; + data[20000] = 3.48923868; + data[20200] = 2.57835095; + data[20201] = 6.77582854; + data[20202] = 9.40941647; + data[20203] = 5.31194592; + data[20204] = 1.21447595; + data[20403] = 0.75411191; + data[20404] = 4.75395704; + data[20405] = 8.75380263; + data[20406] = 7.19209015; + data[20407] = 3.28754401; + data[20607] = 2.68179690; + data[20608] = 6.49331464; + data[20609] = 9.11457930; + data[20610] = 5.39387390; + data[20611] = 1.67316827; + data[20810] = 0.57394296; + data[20811] = 4.20600036; + data[20812] = 7.83805829; + 
data[20813] = 7.52023002; + data[20814] = 3.97470826; + data[20815] = 0.42918732; + data[21014] = 1.90464477; + data[21015] = 5.36569161; + data[21016] = 8.82673822; + data[21017] = 6.27609482; + data[21018] = 2.89750961; + data[21218] = 2.89885257; + data[21219] = 6.19694078; + data[21220] = 8.56699049; + data[21221] = 5.34748193; + data[21222] = 2.12797290; + data[21421] = 0.44750227; + data[21422] = 3.59030394; + data[21423] = 6.73310598; + data[21424] = 7.77023612; + data[21425] = 4.70231380; + data[21426] = 1.63439126; + data[21625] = 1.01536023; + data[21626] = 4.01018746; + data[21627] = 7.00501446; + data[21628] = 7.23442994; + data[21629] = 4.31095669; + data[21630] = 1.38748321; + data[21829] = 1.33348850; + data[21830] = 4.18730825; + data[21831] = 7.04112789; + data[21832] = 6.93188375; + data[21833] = 4.14605811; + data[21834] = 1.36023236; + data[22033] = 1.42879714; + data[22034] = 4.14824858; + data[22035] = 6.86769979; + data[22036] = 6.83705276; + data[22037] = 4.18239459; + data[22038] = 1.52773573; + data[22237] = 1.32610439; + data[22238] = 3.91751388; + data[22239] = 6.50892360; + data[22240] = 6.92639686; + data[22241] = 4.39672917; + data[22242] = 1.86706171; + data[22441] = 1.04827771; + data[22442] = 3.51767405; + data[22443] = 5.98707050; + data[22444] = 7.17824046; + data[22445] = 4.76767914; + data[22446] = 2.35711760; + data[22645] = 0.61636406; + data[22646] = 2.96949223; + data[22647] = 5.32262027; + data[22648] = 7.57265091; + data[22649] = 5.27558755; + data[22650] = 2.97852419; + data[22651] = 0.68146095; + data[22849] = 0.04971400; + data[22850] = 2.29204819; + data[22851] = 4.53438237; + data[22852] = 6.77671656; + data[22853] = 5.90240723; + data[22854] = 3.71349836; + data[22855] = 1.52458926; + data[23054] = 1.50285335; + data[23055] = 3.63961048; + data[23056] = 5.77636715; + data[23057] = 6.63159089; + data[23058] = 4.54574358; + data[23059] = 2.45989650; + data[23060] = 0.37404924; + data[23258] = 0.61795861; + data[23259] = 2.65410915; + data[23260] = 4.69025923; + data[23261] = 6.72641024; + data[23262] = 5.46034705; + data[23263] = 3.47270933; + data[23264] = 1.48507138; + data[23463] = 1.59233576; + data[23464] = 3.53261665; + data[23465] = 5.47289755; + data[23466] = 6.44368259; + data[23467] = 4.54962999; + data[23468] = 2.65557761; + data[23469] = 0.76152512; + data[23667] = 0.46749352; + data[23668] = 2.31641904; + data[23669] = 4.16534441; + data[23670] = 6.01426978; + data[23671] = 5.67844696; + data[23672] = 3.87357362; + data[23673] = 2.06870004; + data[23674] = 0.26382666; + data[23872] = 1.05349103; + data[23873] = 2.81536230; + data[23874] = 4.57723346; + data[23875] = 6.33910485; + data[23876] = 5.12815686; + data[23877] = 3.40826320; + data[23878] = 1.68837002; + data[24077] = 1.43350090; + data[24078] = 3.11241671; + data[24079] = 4.79133241; + data[24080] = 6.40943693; + data[24081] = 4.77052201; + data[24082] = 3.13160778; + data[24083] = 1.49269309; + data[24281] = 0.02932359; + data[24282] = 1.62918994; + data[24283] = 3.22905602; + data[24284] = 4.82892245; + data[24285] = 6.14671456; + data[24286] = 4.58496623; + data[24287] = 3.02321767; + data[24288] = 1.46146910; + data[24486] = 0.13601698; + data[24487] = 1.66055572; + data[24488] = 3.18509457; + data[24489] = 4.70963307; + data[24490] = 6.04072399; + data[24491] = 4.55250870; + data[24492] = 3.06429295; + data[24493] = 1.57607743; + data[24494] = 0.08786193; + data[24691] = 0.09328097; + data[24692] = 1.54603878; + data[24693] = 2.99879676; + data[24694] = 4.45155473; 
+    data[24695] = 5.90431225;
+    data[24696] = 4.65566106;
+    data[24697] = 3.23751615;
+    data[24698] = 1.81937125;
+    data[24699] = 0.40122634;
+    data[24897] = 1.30262633;
+    data[24898] = 2.68698297;
+    data[24899] = 4.07133950;
+    data[24900] = 5.45569602;
+    data[24901] = 4.87832492;
+    data[24902] = 3.52695142;
+    data[24903] = 2.17557792;
+    data[24904] = 0.82420459;
+    data[25102] = 0.94595028;
+    data[25103] = 2.26512621;
+    data[25104] = 3.58430226;
+    data[25105] = 4.90347855;
+    data[25106] = 5.20569785;
+    data[25107] = 3.91795207;
+    data[25108] = 2.63020652;
+    data[25109] = 1.34246063;
+    data[25110] = 0.05471494;
+    data[25307] = 0.49037894;
+    data[25308] = 1.74744334;
+    data[25309] = 3.00450763;
+    data[25310] = 4.26157191;
+    data[25311] = 5.51863620;
+    data[25312] = 4.39707236;
+    data[25313] = 3.16995848;
+    data[25314] = 1.94284460;
+    data[25315] = 0.71573065;
+    data[25513] = 1.14698056;
+    data[25514] = 2.34485767;
+    data[25515] = 3.54273478;
+    data[25516] = 4.74061165;
+    data[25517] = 4.95198462;
+    data[25518] = 3.78264743;
+    data[25519] = 2.61331047;
+    data[25520] = 1.44397374;
+    data[25521] = 0.27463681;
+    data[25718] = 0.47569509;
+    data[25719] = 1.61717169;
+    data[25720] = 2.75864848;
+    data[25721] = 3.90012516;
+    data[25722] = 5.04160160;
+    data[25723] = 4.45712078;
+    data[25724] = 3.34284059;
+    data[25725] = 2.22856039;
+    data[25726] = 1.11428020;
+
+    for (auto & val : data) {
+        val /= 1000.0f;
+    }
+
+    filters.data = std::move(data);
+    return filters;
+}
+
+} // namespace whisper_precalc_filters
diff --git a/tools/mtmd/mtmd-audio.h b/tools/mtmd/mtmd-audio.h
new file mode 100644
index 000000000..b7b940aff
--- /dev/null
+++ b/tools/mtmd/mtmd-audio.h
@@ -0,0 +1,47 @@
+#pragma once
+
+#include "ggml.h"
+
+#include <cstdint>
+#include <vector>
+#include 
+
+#define WHISPER_ASSERT GGML_ASSERT
+
+#define WHISPER_SAMPLE_RATE 16000
+#define WHISPER_N_FFT 400
+#define WHISPER_HOP_LENGTH 160
+#define WHISPER_CHUNK_SIZE 30
+
+#define COMMON_SAMPLE_RATE 16000
+
+namespace whisper_preprocessor {
+
+struct whisper_mel {
+    int n_len;
+    int n_len_org;
+    int n_mel;
+
+    std::vector<float> data;
+};
+
+struct whisper_filters {
+    int32_t n_mel;
+    int32_t n_fft;
+
+    std::vector<float> data;
+};
+
+bool preprocess_audio(
+        const float * samples,
+        size_t n_samples,
+        const whisper_filters & filters,
+        std::vector<whisper_mel> & output);
+
+} // namespace whisper_preprocessor
+
+namespace whisper_precalc_filters {
+
+whisper_preprocessor::whisper_filters get_128_bins();
+
+} // namespace whisper_precalc_filters
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index 4977d5480..508a64c58 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -7,6 +7,7 @@
 #include "console.h"
 #include "chat.h"
 #include "mtmd.h"
+#include "mtmd-helper.h"
 
 #include 
 #include 
@@ -37,10 +38,10 @@ static volatile bool g_is_interrupted = false;
 
 static void show_additional_info(int /*argc*/, char ** argv) {
     LOG(
         "Experimental CLI for multimodal\n\n"
-        "Usage: %s [options] -m --mmproj --image -p \n\n"
+        "Usage: %s [options] -m --mmproj --image --audio
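
Editor's note: for orientation, below is a minimal usage sketch of the whisper_preprocessor / whisper_precalc_filters API introduced by this patch. It is not part of the diff; the driver program and the silent test signal are illustrative assumptions, and only the types, constants and function declarations are taken from tools/mtmd/mtmd-audio.h above.

// hypothetical example driver, not included in the patch
#include "mtmd-audio.h"

#include <cstdio>
#include <vector>

int main() {
    // one second of silence at 16 kHz as a stand-in for real PCM input
    std::vector<float> pcm(WHISPER_SAMPLE_RATE, 0.0f);

    // precalculated 128-bin mel filter bank defined in mtmd-audio.cpp above
    const whisper_preprocessor::whisper_filters filters = whisper_precalc_filters::get_128_bins();

    // preprocess_audio() returns the mel spectrogram split into chunks of
    // WHISPER_CHUNK_SIZE * WHISPER_SAMPLE_RATE / WHISPER_HOP_LENGTH = 30 * 16000 / 160 = 3000 frames
    std::vector<whisper_preprocessor::whisper_mel> chunks;
    if (!whisper_preprocessor::preprocess_audio(pcm.data(), pcm.size(), filters, chunks)) {
        std::fprintf(stderr, "audio preprocessing failed\n");
        return 1;
    }

    for (const auto & mel : chunks) {
        std::printf("chunk: n_mel = %d, n_len = %d, n_len_org = %d\n", mel.n_mel, mel.n_len, mel.n_len_org);
    }
    return 0;
}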