feat merge

2024-03-12 21:44:27 +08:00 · 2024-03-12 21:44:27 +08:00 · 8d2c92c44e
commit 8d2c92c44e
parent ec1a8e1fcf 9ed6245cfb
145 changed files with 44091 additions and 140 deletions
--- a/.gitmodules
+++ b/.gitmodules
--- a/3party/inja/inja.hpp
+++ b/3party/inja/inja.hpp
--- a/3party/inja/nlohmann/json.hpp
+++ b/3party/inja/nlohmann/json.hpp
--- a/3party/marl/.clang-format
+++ b/3party/marl/.clang-format
@ -0,0 +1,5 @@
+# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
+BasedOnStyle: Chromium
+
+---
+Language:        Cpp
--- a/3party/marl/.gitignore
+++ b/3party/marl/.gitignore
@ -0,0 +1,8 @@
+/.vs/
+/.vscode/
+/build/
+/cmake-build-*/
+/out/
+bazel-*
+CMakeSettings.json
+/.idea/
--- a/3party/marl/.gitmodules
+++ b/3party/marl/.gitmodules
@ -0,0 +1,6 @@
+[submodule "third_party/googletest"]
+	path = third_party/googletest
+	url = https://github.com/google/googletest.git
+[submodule "third_party/benchmark"]
+	path = third_party/benchmark
+	url = https://github.com/google/benchmark.git
--- a/3party/marl/AUTHORS
+++ b/3party/marl/AUTHORS
@ -0,0 +1,9 @@
+# This is the list of the Marl authors for copyright purposes.
+#
+# This does not necessarily list everyone who has contributed code, since in
+# some cases, their employer may be the copyright holder.  To see the full list
+# of contributors, see the revision history in source control.
+Google LLC
+Shawn Anastasio <shawn@anastas.io>
+A. Wilcox <awilfox@adelielinux.org>
+Jiaxun Yang <jiaxun.yang@flygoat.com>
--- a/3party/marl/BUILD.bazel
+++ b/3party/marl/BUILD.bazel
@ -0,0 +1,68 @@
+# Copyright 2019 The Marl Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Matches builds whose target platform is Linux on x86_64. Used below to
+# decide whether `-pthread` is added to the link options of `marl`.
+config_setting(
+    name = "linux_x86_64",
+    constraint_values = [
+        "@platforms//os:linux",
+        "@platforms//cpu:x86_64",
+    ],
+)
+
+# Matches builds targeting Windows. Used below to leave the `.S` assembly
+# sources out of the `marl` library on that platform.
+config_setting(
+    name = "windows",
+    constraint_values = ["@platforms//os:windows"],
+)
+
+# The marl library itself.
+#
+# Sources are every .cpp/.c/.h file under src/, minus the unit-test
+# (*_test.cpp) and benchmark (*_bench.cpp) translation units. The `.S`
+# assembly files are added on every platform except Windows. Public headers
+# live under include/marl, with `include` exported as an include root.
+cc_library(
+    name = "marl",
+    srcs = glob(
+        [
+            "src/**/*.cpp",
+            "src/**/*.c",
+            "src/**/*.h",
+        ],
+        exclude = glob([
+            "src/**/*_bench.cpp",
+            "src/**/*_test.cpp",
+        ]),
+    ) + select({
+        ":windows": [],
+        "//conditions:default": glob(["src/**/*.S"]),
+    }),
+    hdrs = glob([
+        "include/marl/**/*.h",
+    ]),
+    includes = [
+        "include",
+    ],
+    # `-pthread` is only requested for the linux_x86_64 configuration; every
+    # other configuration links with no extra options.
+    linkopts = select({
+        ":linux_x86_64": ["-pthread"],
+        "//conditions:default": [],
+    }),
+    visibility = [
+        "//visibility:public",
+    ],
+)
+
+# Unit tests: all *_test.cpp files under src/, linked against the marl
+# library and googletest's `gtest` target.
+cc_test(
+    name = "tests",
+    srcs = glob([
+        "src/**/*_test.cpp",
+    ]),
+    deps = [
+        "//:marl",
+        "@googletest//:gtest",
+    ],
+)
--- a/3party/marl/CHANGES.md
+++ b/3party/marl/CHANGES.md
@ -0,0 +1,8 @@
+# Revision history for `marl`
+
+All notable changes to this project will be documented in this file.
+This project adheres to [Semantic Versioning](https://semver.org/).
+
+## 1.0.0-dev
+
+First versioned release of marl.
--- a/3party/marl/CMakeLists.txt
+++ b/3party/marl/CMakeLists.txt
@ -0,0 +1,425 @@
+# Copyright 2019 The Marl Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+cmake_minimum_required(VERSION 3.0)
+
+# parse_version() is provided by cmake/parse_version.cmake (not shown here).
+# Presumably it derives the MARL_VERSION_* variables consumed by project()
+# below from CHANGES.md - confirm against that module.
+include(cmake/parse_version.cmake)
+parse_version("${CMAKE_CURRENT_SOURCE_DIR}/CHANGES.md" MARL)
+
+# Default C++ standard for the targets created by this file.
+set(CMAKE_CXX_STANDARD 11)
+
+# ASM is enabled alongside C and C++ for the osfiber_asm_*.S sources.
+project(Marl
+    VERSION   "${MARL_VERSION_MAJOR}.${MARL_VERSION_MINOR}.${MARL_VERSION_PATCH}"
+    LANGUAGES C CXX ASM
+)
+
+# Emscripten builds always compile with optimisation and pthread support.
+if (EMSCRIPTEN)
+    add_compile_options(-O3 -pthread)
+endif()
+
+# Provides check_cxx_source_compiles(), used by the compiler feature tests.
+include(CheckCXXSourceCompiles)
+
+# MARL_IS_SUBPROJECT is 1 if added via add_subdirectory() from another project.
+# PARENT_DIRECTORY is the source directory of the parent, or empty for the
+# top-level directory; a non-empty path is normalised to 1 here.
+get_directory_property(MARL_IS_SUBPROJECT PARENT_DIRECTORY)
+if(MARL_IS_SUBPROJECT)
+    set(MARL_IS_SUBPROJECT 1)
+endif()
+
+###########################################################
+# Options
+###########################################################
+# Declares the boolean option `name` with help text `description` and initial
+# value `default`, but only when no variable called `name` is defined yet.
+# An existing definition is left untouched.
+function(option_if_not_defined name description default)
+    if(DEFINED ${name})
+        return()
+    endif()
+    option(${name} ${description} ${default})
+endfunction()
+
+# User-facing build switches. Every one defaults to OFF, and because they go
+# through option_if_not_defined() a value that is already defined when this
+# file runs takes precedence over the default given here.
+option_if_not_defined(MARL_WARNINGS_AS_ERRORS "Treat warnings as errors" OFF)
+option_if_not_defined(MARL_BUILD_EXAMPLES "Build example applications" OFF)
+option_if_not_defined(MARL_BUILD_TESTS "Build tests" OFF)
+option_if_not_defined(MARL_BUILD_BENCHMARKS "Build benchmarks" OFF)
+option_if_not_defined(MARL_BUILD_SHARED "Build marl as a shared / dynamic library (default static)" OFF)
+option_if_not_defined(MARL_USE_PTHREAD_THREAD_LOCAL "Use pthreads for thread local storage" OFF)
+option_if_not_defined(MARL_ASAN "Build marl with address sanitizer" OFF)
+option_if_not_defined(MARL_MSAN "Build marl with memory sanitizer" OFF)
+option_if_not_defined(MARL_TSAN "Build marl with thread sanitizer" OFF)
+option_if_not_defined(MARL_UBSAN "Build marl with undefined-behavior sanitizer" OFF)
+option_if_not_defined(MARL_INSTALL "Create marl install target" OFF)
+option_if_not_defined(MARL_FULL_BENCHMARK "Run benchmarks for [0 .. numLogicalCPUs] with no stepping" OFF)
+option_if_not_defined(MARL_FIBERS_USE_UCONTEXT "Use ucontext instead of assembly for fibers (ignored for platforms that do not support ucontext)" OFF)
+option_if_not_defined(MARL_DEBUG_ENABLED "Enable debug checks even in release builds" OFF)
+
+###########################################################
+# Directories
+###########################################################
+# Assigns `value` to the variable `name` in the caller's scope, unless a
+# variable with that name is already defined, in which case nothing changes.
+function(set_if_not_defined name value)
+    if(DEFINED ${name})
+        return()
+    endif()
+    set(${name} ${value} PARENT_SCOPE)
+endfunction()
+
+# The source and include directories are fixed relative to this file. The
+# third-party locations are only defaults: defining any of these variables
+# before this point overrides the path used.
+set(MARL_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src)
+set(MARL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include)
+set_if_not_defined(MARL_THIRD_PARTY_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party)
+set_if_not_defined(MARL_GOOGLETEST_DIR ${MARL_THIRD_PARTY_DIR}/googletest)
+set_if_not_defined(MARL_BENCHMARK_DIR ${MARL_THIRD_PARTY_DIR}/benchmark)
+
+###########################################################
+# Submodules
+###########################################################
+# Tests need the googletest checkout. If it is absent, warn and switch the
+# tests off instead of failing the configure step.
+if(MARL_BUILD_TESTS)
+    if(NOT EXISTS ${MARL_GOOGLETEST_DIR}/.git)
+        message(WARNING "third_party/googletest submodule missing.")
+        message(WARNING "Run: `git submodule update --init` to build tests.")
+        set(MARL_BUILD_TESTS OFF)
+    endif()
+endif()
+
+# Benchmarks need the google benchmark checkout. When it is present it is
+# added as a subdirectory with its own tests disabled; when it is absent,
+# warn and switch the benchmarks off.
+if(MARL_BUILD_BENCHMARKS)
+    if(EXISTS ${MARL_BENCHMARK_DIR}/.git)
+        set(BENCHMARK_ENABLE_TESTING FALSE CACHE BOOL FALSE FORCE)
+        add_subdirectory(${MARL_BENCHMARK_DIR})
+    else()
+        message(WARNING "third_party/benchmark submodule missing.")
+        message(WARNING "Run: `git submodule update --init` to build benchmarks.")
+        set(MARL_BUILD_BENCHMARKS OFF)
+    endif()
+endif()
+
+###########################################################
+# Compiler feature tests
+###########################################################
+# Check that the Clang Thread Safety Analysis' try_acquire_capability behaves
+# correctly. This is broken on some earlier versions of clang.
+# See: https://bugs.llvm.org/show_bug.cgi?id=32954
+#
+# The probe is compiled with -Wthread-safety -Werror, so the result variable
+# MARL_THREAD_SAFETY_ANALYSIS_SUPPORTED is only true when the snippet builds
+# without any warning. CMAKE_REQUIRED_FLAGS is saved beforehand and restored
+# afterwards so these flags do not leak into later checks.
+set(SAVE_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
+set(CMAKE_REQUIRED_FLAGS "-Wthread-safety -Werror")
+check_cxx_source_compiles(
+    "int main() {
+      struct __attribute__((capability(\"mutex\"))) Mutex {
+        void Unlock() __attribute__((release_capability)) {};
+        bool TryLock() __attribute__((try_acquire_capability(true))) { return true; };
+      };
+      Mutex m;
+      if (m.TryLock()) {
+        m.Unlock();  // Should not warn.
+      }
+      return 0;
+    }"
+    MARL_THREAD_SAFETY_ANALYSIS_SUPPORTED)
+set(CMAKE_REQUIRED_FLAGS ${SAVE_CMAKE_REQUIRED_FLAGS})
+
+# Check whether ucontext is supported.
+# The probe is compiled with -Werror and stores its result in
+# MARL_UCONTEXT_SUPPORTED. CMAKE_REQUIRED_FLAGS is saved and restored around
+# it, in the same way as for the thread-safety probe.
+set(SAVE_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
+set(CMAKE_REQUIRED_FLAGS "-Werror")
+check_cxx_source_compiles(
+    "#include <ucontext.h>
+    int main() {
+      ucontext_t ctx;
+      getcontext(&ctx);
+      makecontext(&ctx, nullptr, 2, 1, 2);
+      swapcontext(&ctx, &ctx);
+      return 0;
+    }"
+    MARL_UCONTEXT_SUPPORTED)
+set(CMAKE_REQUIRED_FLAGS ${SAVE_CMAKE_REQUIRED_FLAGS})
+# A request for ucontext fibers is downgraded (with a warning) rather than
+# treated as an error when the probe above failed.
+if (MARL_FIBERS_USE_UCONTEXT AND NOT MARL_UCONTEXT_SUPPORTED)
+    # Disable MARL_FIBERS_USE_UCONTEXT and warn if MARL_UCONTEXT_SUPPORTED is 0.
+    message(WARNING "MARL_FIBERS_USE_UCONTEXT is enabled, but ucontext is not supported by the target. Disabling")
+    set(MARL_FIBERS_USE_UCONTEXT 0)
+endif()
+
+if(MARL_IS_SUBPROJECT)
+    # Export supported flags as this may be useful to parent projects.
+    # PARENT_SCOPE is only recognised as the final argument of set(); placed
+    # anywhere else it becomes part of the value and nothing is exported.
+    # The values are quoted so that an empty result is still passed as a value.
+    set(MARL_THREAD_SAFETY_ANALYSIS_SUPPORTED "${MARL_THREAD_SAFETY_ANALYSIS_SUPPORTED}" PARENT_SCOPE)
+    set(MARL_UCONTEXT_SUPPORTED               "${MARL_UCONTEXT_SUPPORTED}"               PARENT_SCOPE)
+endif()
+
+###########################################################
+# File lists
+###########################################################
+# Sources compiled into the marl library on every platform.
+set(MARL_LIST
+    ${MARL_SRC_DIR}/debug.cpp
+    ${MARL_SRC_DIR}/memory.cpp
+    ${MARL_SRC_DIR}/scheduler.cpp
+    ${MARL_SRC_DIR}/thread.cpp
+    ${MARL_SRC_DIR}/trace.cpp
+)
+# Non-MSVC builds additionally get the fiber implementations for every
+# architecture, listed unconditionally. Presumably each file guards its
+# contents with architecture checks - confirm in the sources.
+if(NOT MSVC)
+    list(APPEND MARL_LIST
+        ${MARL_SRC_DIR}/osfiber_aarch64.c
+        ${MARL_SRC_DIR}/osfiber_arm.c
+        ${MARL_SRC_DIR}/osfiber_asm_aarch64.S
+        ${MARL_SRC_DIR}/osfiber_asm_arm.S
+        ${MARL_SRC_DIR}/osfiber_asm_loongarch64.S
+        ${MARL_SRC_DIR}/osfiber_asm_mips64.S
+        ${MARL_SRC_DIR}/osfiber_asm_ppc64.S
+        ${MARL_SRC_DIR}/osfiber_asm_rv64.S
+        ${MARL_SRC_DIR}/osfiber_asm_x64.S
+        ${MARL_SRC_DIR}/osfiber_asm_x86.S
+        ${MARL_SRC_DIR}/osfiber_loongarch64.c
+        ${MARL_SRC_DIR}/osfiber_mips64.c
+        ${MARL_SRC_DIR}/osfiber_ppc64.c
+        ${MARL_SRC_DIR}/osfiber_rv64.c
+        ${MARL_SRC_DIR}/osfiber_x64.c
+        ${MARL_SRC_DIR}/osfiber_x86.c
+        ${MARL_SRC_DIR}/osfiber_emscripten.cpp
+    )
+    # CMAKE_OSX_ARCHITECTURES settings aren't propagated to assembly files when
+    # building for Apple platforms (https://gitlab.kitware.com/cmake/cmake/-/issues/20771),
+    # we treat assembly files as C files to work around this bug.
+    # NOTE(review): osfiber_asm_rv64.S is part of MARL_LIST above but is not
+    # in this override list, so it alone keeps its default language. Confirm
+    # whether that omission is intentional.
+    set_source_files_properties(
+        ${MARL_SRC_DIR}/osfiber_asm_aarch64.S
+        ${MARL_SRC_DIR}/osfiber_asm_arm.S
+        ${MARL_SRC_DIR}/osfiber_asm_loongarch64.S
+        ${MARL_SRC_DIR}/osfiber_asm_mips64.S
+        ${MARL_SRC_DIR}/osfiber_asm_ppc64.S
+        ${MARL_SRC_DIR}/osfiber_asm_x64.S
+        ${MARL_SRC_DIR}/osfiber_asm_x86.S
+        PROPERTIES LANGUAGE C
+    )
+endif(NOT MSVC)
+
+###########################################################
+# OS libraries
+###########################################################
+# Supplies the Threads::Threads imported target that the marl library links
+# against. Configuration fails if no thread library can be found (REQUIRED).
+find_package(Threads REQUIRED)
+
+###########################################################
+# Functions
+###########################################################
+# Applies marl's common build settings to `target`: thread-safety analysis
+# (when the probe succeeded), warning level, optional warnings-as-errors,
+# optional pthread-based thread-local storage, at most one sanitizer, the
+# fiber/debug compile definitions, and the marl include directory.
+#
+# Used for the library itself as well as for the test, benchmark and example
+# executables.
+function(marl_set_target_options target)
+    if(MARL_THREAD_SAFETY_ANALYSIS_SUPPORTED)
+        target_compile_options(${target} PRIVATE "-Wthread-safety")
+    endif()
+
+    # Enable all warnings
+    if(MSVC)
+        target_compile_options(${target} PRIVATE "-W4")
+    else()
+        target_compile_options(${target} PRIVATE "-Wall")
+    endif()
+
+    # Disable specific, pedantic warnings
+    if(MSVC)
+        target_compile_options(${target} PRIVATE
+            "-D_CRT_SECURE_NO_WARNINGS"
+            "/wd4127" # conditional expression is constant
+            "/wd4324" # structure was padded due to alignment specifier
+        )
+    endif()
+
+    # Treat all warnings as errors
+    if(MARL_WARNINGS_AS_ERRORS)
+        if(MSVC)
+            target_compile_options(${target} PRIVATE "/WX")
+        else()
+            target_compile_options(${target} PRIVATE "-Werror")
+        endif()
+    endif(MARL_WARNINGS_AS_ERRORS)
+
+    if(MARL_USE_PTHREAD_THREAD_LOCAL)
+        target_compile_definitions(${target} PRIVATE "MARL_USE_PTHREAD_THREAD_LOCAL=1")
+        target_link_libraries(${target} PUBLIC pthread)
+    endif()
+
+    # The sanitizer options form an if/elseif chain, so at most one is
+    # applied, with precedence ASAN > MSAN > TSAN > UBSAN. The flags are
+    # PUBLIC and therefore also propagate to anything linking `target`.
+    if(MARL_ASAN)
+        target_compile_options(${target} PUBLIC "-fsanitize=address")
+        target_link_libraries(${target} PUBLIC "-fsanitize=address")
+    elseif(MARL_MSAN)
+        target_compile_options(${target} PUBLIC "-fsanitize=memory")
+        target_link_libraries(${target} PUBLIC "-fsanitize=memory")
+    elseif(MARL_TSAN)
+        target_compile_options(${target} PUBLIC "-fsanitize=thread")
+        target_link_libraries(${target} PUBLIC "-fsanitize=thread")
+    elseif(MARL_UBSAN)
+        target_compile_options(${target} PUBLIC "-fsanitize=undefined")
+        target_link_libraries(${target} PUBLIC "-fsanitize=undefined")
+    endif()
+
+    if(MARL_FIBERS_USE_UCONTEXT)
+        target_compile_definitions(${target} PRIVATE "MARL_FIBERS_USE_UCONTEXT=1")
+    endif()
+
+    if(MARL_DEBUG_ENABLED)
+        target_compile_definitions(${target} PRIVATE "MARL_DEBUG_ENABLED=1")
+    endif()
+
+    # INTERFACE: the library is added to the link line of targets that link
+    # against `target`.
+    if(CMAKE_SYSTEM_PROCESSOR MATCHES "^rv.*")
+        target_link_libraries(${target} INTERFACE atomic) #explicitly use -latomic for RISC-V linking
+    endif()
+
+    # BUILD_INTERFACE restricts this include path to in-tree builds; the
+    # installed package gets its include directory from install(TARGETS).
+    target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${MARL_INCLUDE_DIR}>)
+endfunction(marl_set_target_options)
+
+###########################################################
+# Targets
+###########################################################
+
+# marl
+# The library is built SHARED when either MARL_BUILD_SHARED or CMake's
+# standard BUILD_SHARED_LIBS is set; otherwise add_library() is called
+# without an explicit type.
+if(MARL_BUILD_SHARED OR BUILD_SHARED_LIBS)
+    add_library(marl SHARED ${MARL_LIST})
+    if(MSVC)
+        # MARL_BUILDING_DLL is defined only while compiling marl itself;
+        # MARL_DLL is also defined for every target that links against it.
+        target_compile_definitions(marl
+            PRIVATE "MARL_BUILDING_DLL=1"
+            PUBLIC  "MARL_DLL=1"
+        )
+    endif()
+else()
+    add_library(marl ${MARL_LIST})
+endif()
+
+if(NOT MSVC)
+    # Public API symbols are made visible with the MARL_EXPORT annotation.
+    target_compile_options(marl PRIVATE "-fvisibility=hidden")
+endif()
+
+# Position-independent code is always enabled, and the library is versioned
+# with the MARL_VERSION* values.
+set_target_properties(marl PROPERTIES
+    POSITION_INDEPENDENT_CODE 1
+    VERSION ${MARL_VERSION}
+    SOVERSION "${MARL_VERSION_MAJOR}"
+)
+
+marl_set_target_options(marl)
+
+target_link_libraries(marl PUBLIC Threads::Threads)
+
+# install
+# Only generated when MARL_INSTALL is enabled. Installs the public headers,
+# the marl library, an exported target set under the `marl::` namespace and
+# a marl-config.cmake package file, all at GNUInstallDirs locations.
+if(MARL_INSTALL)
+    include(CMakePackageConfigHelpers)
+    include(GNUInstallDirs)
+
+    # Generates marl-config.cmake in the build tree from the template in
+    # cmake/marl-config.cmake.in.
+    configure_package_config_file(
+        ${CMAKE_CURRENT_SOURCE_DIR}/cmake/marl-config.cmake.in
+        ${CMAKE_CURRENT_BINARY_DIR}/marl-config.cmake
+        INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/marl
+    )
+
+    # Public headers, keeping the file permissions they have in the source tree.
+    install(DIRECTORY ${MARL_INCLUDE_DIR}/marl
+        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+        USE_SOURCE_PERMISSIONS
+    )
+
+    # The library, recorded in the `marl-targets` export set.
+    install(TARGETS marl
+        EXPORT marl-targets
+        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+        RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+        INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+    )
+
+    # The export set and the generated config file share one directory.
+    install(EXPORT marl-targets
+        FILE marl-targets.cmake
+        NAMESPACE marl::
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/marl
+    )
+
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/marl-config.cmake
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/marl
+    )
+endif(MARL_INSTALL)
+
+# tests
+# Builds the `marl-unittests` executable when MARL_BUILD_TESTS is still ON
+# after the submodule check earlier in this file.
+if(MARL_BUILD_TESTS)
+    # googletest and googlemock are compiled straight into the executable
+    # through their amalgamated gtest-all.cc / gmock-all.cc sources.
+    set(MARL_TEST_LIST
+        ${MARL_SRC_DIR}/blockingcall_test.cpp
+        ${MARL_SRC_DIR}/conditionvariable_test.cpp
+        ${MARL_SRC_DIR}/containers_test.cpp
+        ${MARL_SRC_DIR}/dag_test.cpp
+        ${MARL_SRC_DIR}/defer_test.cpp
+        ${MARL_SRC_DIR}/event_test.cpp
+        ${MARL_SRC_DIR}/marl_test.cpp
+        ${MARL_SRC_DIR}/marl_test.h
+        ${MARL_SRC_DIR}/memory_test.cpp
+        ${MARL_SRC_DIR}/osfiber_test.cpp
+        ${MARL_SRC_DIR}/parallelize_test.cpp
+        ${MARL_SRC_DIR}/pool_test.cpp
+        ${MARL_SRC_DIR}/scheduler_test.cpp
+        ${MARL_SRC_DIR}/thread_test.cpp
+        ${MARL_SRC_DIR}/ticket_test.cpp
+        ${MARL_SRC_DIR}/waitgroup_test.cpp
+        ${MARL_GOOGLETEST_DIR}/googletest/src/gtest-all.cc
+        ${MARL_GOOGLETEST_DIR}/googlemock/src/gmock-all.cc
+    )
+
+    # Both the public include directories and the googletest/googlemock roots
+    # are on the include path of the test executable.
+    set(MARL_TEST_INCLUDE_DIR
+        ${MARL_GOOGLETEST_DIR}/googletest/include/
+        ${MARL_GOOGLETEST_DIR}/googlemock/include/
+        ${MARL_GOOGLETEST_DIR}/googletest/
+        ${MARL_GOOGLETEST_DIR}/googlemock/
+    )
+
+    add_executable(marl-unittests ${MARL_TEST_LIST})
+
+    set_target_properties(marl-unittests PROPERTIES
+        INCLUDE_DIRECTORIES "${MARL_TEST_INCLUDE_DIR}"
+        FOLDER "Tests"
+    )
+
+    # Called after the property assignment above, so the marl include
+    # directory it adds is appended to that list.
+    marl_set_target_options(marl-unittests)
+
+    target_link_libraries(marl-unittests PRIVATE marl)
+endif(MARL_BUILD_TESTS)
+
+# benchmarks
+# Builds the `marl-benchmarks` executable when MARL_BUILD_BENCHMARKS is still
+# ON after the submodule check earlier in this file.
+if(MARL_BUILD_BENCHMARKS)
+    set(MARL_BENCHMARK_LIST
+        ${MARL_SRC_DIR}/blockingcall_bench.cpp
+        ${MARL_SRC_DIR}/defer_bench.cpp
+        ${MARL_SRC_DIR}/event_bench.cpp
+        ${MARL_SRC_DIR}/marl_bench.cpp
+        ${MARL_SRC_DIR}/non_marl_bench.cpp
+        ${MARL_SRC_DIR}/scheduler_bench.cpp
+        ${MARL_SRC_DIR}/ticket_bench.cpp
+        ${MARL_SRC_DIR}/waitgroup_bench.cpp
+    )
+
+    add_executable(marl-benchmarks ${MARL_BENCHMARK_LIST})
+    # The target is named explicitly: no `target` variable is defined at this
+    # scope, so `${target}` would expand to nothing and the FOLDER property
+    # would never reach marl-benchmarks.
+    set_target_properties(marl-benchmarks PROPERTIES FOLDER "Benchmarks")
+
+    marl_set_target_options(marl-benchmarks)
+
+    # NOTE(review): the option value is forwarded verbatim (e.g. ON / OFF)
+    # into the macro. Confirm the C++ side copes with non-numeric values.
+    target_compile_definitions(marl-benchmarks PRIVATE
+        "MARL_FULL_BENCHMARK=${MARL_FULL_BENCHMARK}"
+    )
+
+    target_link_libraries(marl-benchmarks PRIVATE benchmark::benchmark marl)
+endif(MARL_BUILD_BENCHMARKS)
+
+# examples
+# Builds one executable per example when MARL_BUILD_EXAMPLES is ON.
+if(MARL_BUILD_EXAMPLES)
+    # Creates the executable `target` from examples/<target>.cpp, applies the
+    # common marl target options and links it against the marl library.
+    # Emscripten builds additionally get the link options below and produce
+    # an .html output using examples/shell.emscripten.html as the shell file.
+    function(build_example target)
+        add_executable(${target} "${CMAKE_CURRENT_SOURCE_DIR}/examples/${target}.cpp")
+        set_target_properties(${target} PROPERTIES FOLDER "Examples")
+        marl_set_target_options(${target})
+        target_link_libraries(${target} PRIVATE marl)
+        if (EMSCRIPTEN)
+            # NOTE(review): target_link_options() was introduced in CMake
+            # 3.13, which is newer than the 3.0 minimum declared by this
+            # file. Only Emscripten builds reach this call.
+            target_link_options(${target} PRIVATE
+                    -O1
+                    -pthread -sPTHREAD_POOL_SIZE=2 -sPROXY_TO_PTHREAD
+                    -sASYNCIFY # -sASYNCIFY_STACK_SIZE=1000000
+                    -sALLOW_MEMORY_GROWTH=1 -sASSERTIONS
+                    -sENVIRONMENT=web,worker
+                    "SHELL:--shell-file ${CMAKE_CURRENT_SOURCE_DIR}/examples/shell.emscripten.html")
+            set_target_properties(${target} PROPERTIES SUFFIX .html)
+        endif()
+    endfunction(build_example)
+
+    build_example(fractal)
+    build_example(hello_task)
+    build_example(primes)
+    build_example(tasks_in_tasks)
+endif(MARL_BUILD_EXAMPLES)
--- a/3party/marl/CONTRIBUTING.md
+++ b/3party/marl/CONTRIBUTING.md
@ -0,0 +1,28 @@
+# How to Contribute
+
+We'd love to accept your patches and contributions to this project. There are
+just a few small guidelines you need to follow.
+
+## Contributor License Agreement
+
+Contributions to this project must be accompanied by a Contributor License
+Agreement. You (or your employer) retain the copyright to your contribution;
+this simply gives us permission to use and redistribute your contributions as
+part of the project. Head over to <https://cla.developers.google.com/> to see
+your current agreements on file or to sign a new one.
+
+You generally only need to submit a CLA once, so if you've already submitted one
+(even if it was for a different project), you probably don't need to do it
+again.
+
+## Code reviews
+
+All submissions, including submissions by project members, require review. We
+use GitHub pull requests for this purpose. Consult
+[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
+information on using pull requests.
+
+## Community Guidelines
+
+This project follows
+[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
--- a/3party/marl/LICENSE
+++ b/3party/marl/LICENSE
@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/3party/marl/README.md
+++ b/3party/marl/README.md
@ -0,0 +1,234 @@
+# Marl
+
+Marl is a hybrid thread / fiber task scheduler written in C++ 11.
+
+## About
+
+Marl is a C++ 11 library that provides a fluent interface for running tasks across a number of threads.
+
+Marl uses a combination of fibers and threads to allow efficient execution of tasks that can block, while keeping a fixed number of hardware threads.
+
+Marl supports Windows, macOS, Linux, FreeBSD, Fuchsia, Emscripten, Android and iOS (arm, aarch64, loongarch64, mips64, ppc64, rv64, x86 and x64).
+
+Marl has no dependencies on other libraries (with an exception on googletest for building the optional unit tests).
+
+Example:
+
+```cpp
+#include "marl/defer.h"
+#include "marl/event.h"
+#include "marl/scheduler.h"
+#include "marl/waitgroup.h"
+
+#include <cstdio>
+
+int main() {
+  // Create a marl scheduler using all the logical processors available to the process.
+  // Bind this scheduler to the main thread so we can call marl::schedule()
+  marl::Scheduler scheduler(marl::Scheduler::Config::allCores());
+  scheduler.bind();
+  defer(scheduler.unbind());  // Automatically unbind before returning.
+
+  constexpr int numTasks = 10;
+
+  // Create an event that is manually reset.
+  marl::Event sayHello(marl::Event::Mode::Manual);
+
+  // Create a WaitGroup with an initial count of numTasks.
+  marl::WaitGroup saidHello(numTasks);
+
+  // Schedule some tasks to run asynchronously.
+  for (int i = 0; i < numTasks; i++) {
+    // Each task will run on one of the scheduler's worker threads.
+    marl::schedule([=] {  // All marl primitives are capture-by-value.
+      // Decrement the WaitGroup counter when the task has finished.
+      defer(saidHello.done());
+
+      printf("Task %d waiting to say hello...\n", i);
+
+      // Blocking in a task?
+      // The scheduler will find something else for this thread to do.
+      sayHello.wait();
+
+      printf("Hello from task %d!\n", i);
+    });
+  }
+
+  sayHello.signal();  // Unblock all the tasks.
+
+  saidHello.wait();  // Wait for all tasks to complete.
+
+  printf("All tasks said hello.\n");
+
+  // All tasks are guaranteed to complete before the scheduler is destructed.
+}
+```
+
+## Benchmarks
+
+Graphs of several microbenchmarks can be found [here](https://google.github.io/marl/benchmarks).
+
+## Building
+
+Marl contains many unit tests and examples that can be built using CMake.
+
+Unit tests require fetching the `googletest` external project, which can be done by typing the following in your terminal:
+
+```bash
+cd <path-to-marl>
+git submodule update --init
+```
+
+### Linux and macOS
+
+To build the unit tests and examples, type the following in your terminal:
+
+```bash
+cd <path-to-marl>
+mkdir build
+cd build
+cmake .. -DMARL_BUILD_EXAMPLES=1 -DMARL_BUILD_TESTS=1
+make
+```
+
+The resulting binaries will be found in `<path-to-marl>/build`
+
+### Emscripten
+
+1. install and activate the emscripten sdk following [standard instructions for your platform](https://emscripten.org/docs/getting_started/downloads.html).
+2. build an example from the examples folder using emscripten, say `hello_task`. 
+```bash
+cd <path-to-marl>
+mkdir build
+cd build
+emcmake cmake .. -DMARL_BUILD_EXAMPLES=1
+make hello_task -j 8
+```
+NOTE: you may want to change the value of the linker flag `sPTHREAD_POOL_SIZE`, which must be at least as large as the number of threads used by your application.
+3. Test the emscripten output.
+You can use the provided python script to create a local web server:
+```bash
+../run_webserver
+```
+In your browser, navigate to the example URL: [http://127.0.0.1:8080/hello_task.html](http://127.0.0.1:8080/hello_task.html).  
+Voilà - you should see the log output appear on the web page.
+
+### Installing Marl (vcpkg)
+
+Alternatively, you can build and install Marl using [vcpkg](https://github.com/Microsoft/vcpkg/) dependency manager:
+
+```bash or powershell
+git clone https://github.com/Microsoft/vcpkg.git
+cd vcpkg
+./bootstrap-vcpkg.sh
+./vcpkg integrate install
+./vcpkg install marl
+```
+
+The Marl port in vcpkg is kept up to date by Microsoft team members and community contributors. If the version is out of date, please [create an issue or pull request](https://github.com/Microsoft/vcpkg) on the vcpkg repository.
+
+### Windows
+
+Marl can be built using [Visual Studio 2019's CMake integration](https://docs.microsoft.com/en-us/cpp/build/cmake-projects-in-visual-studio?view=vs-2019).
+
+### Using Marl in your CMake project
+
+You can build and link Marl using `add_subdirectory()` in your project's `CMakeLists.txt` file:
+
+```cmake
+set(MARL_DIR <path-to-marl>) # example <path-to-marl>: "${CMAKE_CURRENT_SOURCE_DIR}/third_party/marl"
+add_subdirectory(${MARL_DIR})
+```
+
+This will define the `marl` library target, which you can pass to `target_link_libraries()`:
+
+```cmake
+target_link_libraries(<target> marl) # replace <target> with the name of your project's target
+```
+
+You may also wish to specify your own paths to the third party libraries used by `marl`.
+You can do this by setting any of the following variables before the call to `add_subdirectory()`:
+
+```cmake
+set(MARL_THIRD_PARTY_DIR <third-party-root-directory>) # defaults to ${MARL_DIR}/third_party
+set(MARL_GOOGLETEST_DIR  <path-to-googletest>)         # defaults to ${MARL_THIRD_PARTY_DIR}/googletest
+add_subdirectory(${MARL_DIR})
+```
+
+### Usage Recommendations
+
+#### Capture marl synchronization primitives by value
+
+All marl synchronization primitives aside from `marl::ConditionVariable` should be lambda-captured by **value**:
+
+```c++
+marl::Event event;
+marl::schedule([=]{ // [=] Good, [&] Bad.
+  event.signal();
+});
+```
+
+Internally, these primitives hold a shared pointer to the primitive state. By capturing by value we avoid common issues where the primitive may be destructed before the last reference is used.
+
+#### Create one instance of `marl::Scheduler`, use it for the lifetime of the process
+
+The `marl::Scheduler` constructor can be expensive as it may spawn a number of hardware threads. \
+Destructing the `marl::Scheduler` requires waiting on all tasks to complete.
+
+Multiple `marl::Scheduler`s may fight each other for hardware thread utilization.
+
+For these reasons, it is recommended to create a single `marl::Scheduler` for the lifetime of your process.
+
+For example:
+
+```c++
+int main() {
+  marl::Scheduler scheduler(marl::Scheduler::Config::allCores());
+  scheduler.bind();
+  defer(scheduler.unbind());
+
+  return do_program_stuff();
+}
+```
+
+#### Bind the scheduler to externally created threads
+
+In order to call `marl::schedule()` the scheduler must be bound to the calling thread. Failure to bind the scheduler to the thread before calling `marl::schedule()` will result in undefined behavior.
+
+`marl::Scheduler` may be simultaneously bound to any number of threads, and the scheduler can be retrieved from a bound thread with `marl::Scheduler::get()`.
+
+A typical way to pass the scheduler from one thread to another would be:
+
+```c++
+std::thread spawn_new_thread() {
+  // Grab the scheduler from the currently running thread.
+  marl::Scheduler* scheduler = marl::Scheduler::get();
+
+  // Spawn the new thread.
+  return std::thread([=] {
+    // Bind the scheduler to the new thread.
+    scheduler->bind();
+    defer(scheduler->unbind());
+
+    // You can now safely call `marl::schedule()`
+    run_thread_logic();
+  });
+}
+
+```
+
+Always remember to unbind the scheduler before terminating the thread. Forgetting to unbind will result in the `marl::Scheduler` destructor blocking indefinitely.
+
+#### Don't use externally blocking calls in marl tasks
+
+The `marl::Scheduler` internally holds a number of worker threads which will execute the scheduled tasks. If a marl task becomes blocked on a marl synchronization primitive, marl can yield from the blocked task and continue execution of other scheduled tasks.
+
+Calling a non-marl blocking function on a marl worker thread will prevent that worker thread from being able to switch to execute other tasks until the blocking function has returned. Examples of these non-marl blocking functions include: [`std::mutex::lock()`](https://en.cppreference.com/w/cpp/thread/mutex/lock), [`std::condition_variable::wait()`](https://en.cppreference.com/w/cpp/thread/condition_variable/wait), [`accept()`](http://man7.org/linux/man-pages/man2/accept.2.html).
+
+Short blocking calls are acceptable, such as a mutex lock to access a data structure. However be careful that you do not use a marl blocking call with a `std::mutex` lock held - the marl task may yield with the lock held, and block other tasks from re-locking the mutex. This sort of situation may end up with a deadlock.
+
+If you need to make a blocking call from a marl worker thread, you may wish to use [`marl::blocking_call()`](https://github.com/google/marl/blob/main/include/marl/blockingcall.h), which will spawn a new thread for performing the call, allowing the marl worker to continue processing other scheduled tasks.
+
+---
+
+Note: This is not an officially supported Google product
--- a/3party/marl/WORKSPACE
+++ b/3party/marl/WORKSPACE
@ -0,0 +1,31 @@
+# Copyright 2019 The Marl Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+# googletest, pinned to the release-1.11.0 tag archive and verified by sha256.
+http_archive(
+    name = "googletest",  # 2021-07-09
+    sha256 = "353571c2440176ded91c2de6d6cd88ddd41401d14692ec1f99e35d013feda55a",
+    strip_prefix = "googletest-release-1.11.0",
+    urls = ["https://github.com/google/googletest/archive/refs/tags/release-1.11.0.zip"],
+)
+
+# bazelbuild/platforms 0.0.6, verified by sha256. Two download locations are
+# listed: the mirror.bazel.build copy and the GitHub release.
+http_archive(
+    name = "platforms",
+    urls = [
+        "https://mirror.bazel.build/github.com/bazelbuild/platforms/releases/download/0.0.6/platforms-0.0.6.tar.gz",
+        "https://github.com/bazelbuild/platforms/releases/download/0.0.6/platforms-0.0.6.tar.gz",
+    ],
+    sha256 = "5308fc1d8865406a49427ba24a9ab53087f17f5266a7aabbfc28823f3916e1ca",
+)
--- a/3party/marl/cmake/marl-config.cmake.in
+++ b/3party/marl/cmake/marl-config.cmake.in
@ -0,0 +1,23 @@
+# Copyright 2020 The Marl Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Placeholder substituted when this template is configured (presumably by
+# CMakePackageConfigHelpers' configure_package_config_file() - confirm in the
+# top-level CMakeLists.txt).
+@PACKAGE_INIT@
+
+include(CMakeFindDependencyMacro)
+
+# Consumers of the marl package also need the Threads package.
+find_dependency(Threads)
+
+# Only import the exported targets once; marl::marl may already exist if this
+# config file has been processed before.
+if(NOT TARGET marl::marl)
+    include(${CMAKE_CURRENT_LIST_DIR}/marl-targets.cmake)
+endif()
--- a/3party/marl/cmake/parse_version.cmake
+++ b/3party/marl/cmake/parse_version.cmake
@ -0,0 +1,41 @@
+# Copyright 2020 The Marl Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# parse_version() reads and parses the version string from FILE, assigning the
+# version string to ${PROJECT}_VERSION and the parsed version to
+# ${PROJECT}_VERSION_MAJOR, ${PROJECT}_VERSION_MINOR, ${PROJECT}_VERSION_PATCH,
+# and the optional ${PROJECT}_VERSION_FLAVOR. All variables are set in the
+# caller's scope.
+#
+# The version string takes one of the forms:
+#    <major>.<minor>.<patch>
+#    <major>.<minor>.<patch>-<flavor>
+#
+# The version must be preceded by one or more '#' characters (a markdown
+# heading). A fatal error is raised if no such version is found in FILE.
+function(parse_version FILE PROJECT)
+    configure_file(${FILE} "${CMAKE_CURRENT_BINARY_DIR}/CHANGES.md") # Required to re-run cmake on version change
+    file(READ ${FILE} CHANGES)
+    # CHANGES must be quoted: an unquoted expansion is split into several if()
+    # arguments at every ';' in the file (and into none for an empty file),
+    # which makes the if() itself fail instead of reaching the error below.
+    if("${CHANGES}" MATCHES "#+ *([0-9]+)\\.([0-9]+)\\.([0-9]+)(-[a-zA-Z0-9]+)?")
+        set(FLAVOR "")
+        if(NOT "${CMAKE_MATCH_4}" STREQUAL "")
+            # Strip the leading '-' from the captured "-<flavor>" suffix.
+            string(SUBSTRING ${CMAKE_MATCH_4} 1 -1 FLAVOR)
+        endif()
+        set("${PROJECT}_VERSION_MAJOR"  ${CMAKE_MATCH_1} PARENT_SCOPE)
+        set("${PROJECT}_VERSION_MINOR"  ${CMAKE_MATCH_2} PARENT_SCOPE)
+        set("${PROJECT}_VERSION_PATCH"  ${CMAKE_MATCH_3} PARENT_SCOPE)
+        set("${PROJECT}_VERSION_FLAVOR" ${FLAVOR}        PARENT_SCOPE)
+        set("${PROJECT}_VERSION"
+            "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}${CMAKE_MATCH_4}"
+            PARENT_SCOPE)
+    else()
+        message(FATAL_ERROR "Unable to parse version from '${FILE}'")
+    endif()
+endfunction()
--- a/3party/marl/go.mod
+++ b/3party/marl/go.mod
@ -0,0 +1,3 @@
+module github.com/google/marl
+
+go 1.16
--- a/3party/marl/include/marl/blockingcall.h
+++ b/3party/marl/include/marl/blockingcall.h
@ -0,0 +1,106 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef marl_blocking_call_h
+#define marl_blocking_call_h
+
+#include "export.h"
+#include "scheduler.h"
+#include "waitgroup.h"
+
+#include <thread>
+#include <type_traits>
+#include <utility>
+
+namespace marl {
+namespace detail {
+
+// OnNewThread<RETURN_TYPE>::call() invokes f(args...) on a newly spawned
+// std::thread and returns the value produced by f once the call has finished.
+// RETURN_TYPE must be default-constructible and assignable: the result is
+// default-constructed here and assigned to from within the new thread.
+template <typename RETURN_TYPE>
+class OnNewThread {
+ public:
+  template <typename F, typename... Args>
+  MARL_NO_EXPORT inline static RETURN_TYPE call(F&& f, Args&&... args) {
+    RETURN_TYPE result;
+    WaitGroup wg(1);  // Signalled by the new thread once f has returned.
+    auto scheduler = Scheduler::get();  // May be null; checked before use.
+    auto thread = std::thread(
+        // result, f and scheduler are captured by reference, which is safe as
+        // call() does not return before the thread is joined. wg is captured
+        // by value.
+        [&, wg](Args&&... args) {
+          // Bind the caller's scheduler (if any) to the new thread for the
+          // duration of the call to f.
+          if (scheduler != nullptr) {
+            scheduler->bind();
+          }
+          result = f(std::forward<Args>(args)...);
+          if (scheduler != nullptr) {
+            Scheduler::unbind();
+          }
+          wg.done();
+        },
+        std::forward<Args>(args)...);
+    wg.wait();  // Wait for f to complete on the new thread.
+    thread.join();
+    return result;
+  }
+};
+
+// Specialization of OnNewThread for functions returning void.
+// call() invokes f(args...) on a newly spawned std::thread and returns once
+// the call has finished; there is no result to hand back.
+template <>
+class OnNewThread<void> {
+ public:
+  template <typename F, typename... Args>
+  MARL_NO_EXPORT inline static void call(F&& f, Args&&... args) {
+    WaitGroup wg(1);  // Signalled by the new thread once f has returned.
+    auto scheduler = Scheduler::get();  // May be null; checked before use.
+    auto thread = std::thread(
+        // f and scheduler are captured by reference, which is safe as call()
+        // does not return before the thread is joined. wg is captured by
+        // value.
+        [&, wg](Args&&... args) {
+          // Bind the caller's scheduler (if any) to the new thread for the
+          // duration of the call to f.
+          if (scheduler != nullptr) {
+            scheduler->bind();
+          }
+          f(std::forward<Args>(args)...);
+          if (scheduler != nullptr) {
+            Scheduler::unbind();
+          }
+          wg.done();
+        },
+        std::forward<Args>(args)...);
+    wg.wait();  // Wait for f to complete on the new thread.
+    thread.join();
+  }
+};
+
+}  // namespace detail
+
+// blocking_call() runs the callable f with the arguments args on a new thread
+// and returns whatever f returns. While f is running, the calling fiber
+// yields so that other tasks can be executed.
+//
+// Example:
+//
+//  void runABlockingFunctionOnATask()
+//  {
+//      // Schedule a task that calls a blocking, non-yielding function.
+//      marl::schedule([=] {
+//          // call_blocking_function() may block indefinitely.
+//          // Ensure this call does not block other tasks from running.
+//          auto result = marl::blocking_call(call_blocking_function);
+//          // call_blocking_function() has now returned.
+//          // result holds the return value of the blocking function call.
+//      });
+//  }
+template <typename F, typename... Args>
+MARL_NO_EXPORT auto inline blocking_call(F&& f, Args&&... args)
+    -> decltype(f(args...)) {
+  // Select the OnNewThread implementation matching f's return type (the void
+  // case is handled by a dedicated specialization).
+  using Result = decltype(f(args...));
+  using Runner = detail::OnNewThread<Result>;
+  return Runner::call(std::forward<F>(f), std::forward<Args>(args)...);
+}
+
+}  // namespace marl
+
+#endif  // marl_blocking_call_h
--- a/3party/marl/include/marl/conditionvariable.h
+++ b/3party/marl/include/marl/conditionvariable.h
@ -0,0 +1,197 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef marl_condition_variable_h
+#define marl_condition_variable_h
+
+#include "containers.h"
+#include "debug.h"
+#include "memory.h"
+#include "mutex.h"
+#include "scheduler.h"
+#include "tsa.h"
+
+#include <atomic>
+#include <condition_variable>
+
+namespace marl {
+
+// ConditionVariable is a synchronization primitive that can be used to block
+// one or more fibers or threads, until another fiber or thread modifies a
+// shared variable (the condition) and notifies the ConditionVariable.
+//
+// If the ConditionVariable is blocked on a thread with a Scheduler bound, the
+// thread will work on other tasks until the ConditionVariable is unblocked.
+//
+// ConditionVariable is neither copyable nor movable.
+class ConditionVariable {
+ public:
+  // Constructs the ConditionVariable. allocator is used for the internal list
+  // of waiting fibers.
+  MARL_NO_EXPORT inline ConditionVariable(
+      Allocator* allocator = Allocator::Default);
+
+  // notify_one() notifies and potentially unblocks one waiting fiber or thread.
+  MARL_NO_EXPORT inline void notify_one();
+
+  // notify_all() notifies and potentially unblocks all waiting fibers and/or
+  // threads.
+  MARL_NO_EXPORT inline void notify_all();
+
+  // wait() blocks the current fiber or thread until the predicate is satisfied
+  // and the ConditionVariable is notified.
+  template <typename Predicate>
+  MARL_NO_EXPORT inline void wait(marl::lock& lock, Predicate&& pred);
+
+  // wait_for() blocks the current fiber or thread until the predicate is
+  // satisfied, and the ConditionVariable is notified, or the timeout has been
+  // reached. Returns false if pred still evaluates to false after the timeout
+  // has been reached, otherwise true.
+  template <typename Rep, typename Period, typename Predicate>
+  MARL_NO_EXPORT inline bool wait_for(
+      marl::lock& lock,
+      const std::chrono::duration<Rep, Period>& duration,
+      Predicate&& pred);
+
+  // wait_until() blocks the current fiber or thread until the predicate is
+  // satisfied, and the ConditionVariable is notified, or the timeout has been
+  // reached. Returns false if pred still evaluates to false after the timeout
+  // has been reached, otherwise true.
+  template <typename Clock, typename Duration, typename Predicate>
+  MARL_NO_EXPORT inline bool wait_until(
+      marl::lock& lock,
+      const std::chrono::time_point<Clock, Duration>& timeout,
+      Predicate&& pred);
+
+ private:
+  ConditionVariable(const ConditionVariable&) = delete;
+  ConditionVariable(ConditionVariable&&) = delete;
+  ConditionVariable& operator=(const ConditionVariable&) = delete;
+  ConditionVariable& operator=(ConditionVariable&&) = delete;
+
+  // Guards access to the waiting list.
+  marl::mutex mutex;
+  // Fibers that are currently blocked in wait() or wait_until().
+  containers::list<Scheduler::Fiber*> waiting;
+  // Used to block callers that are not running on a scheduler fiber.
+  std::condition_variable condition;
+  // Number of callers (fibers and threads) currently blocked in a wait.
+  std::atomic<int> numWaiting = {0};
+  // Number of callers currently blocked on the std::condition_variable.
+  std::atomic<int> numWaitingOnCondition = {0};
+};
+
+// Constructs the ConditionVariable, handing allocator to the list of waiting
+// fibers.
+ConditionVariable::ConditionVariable(
+    Allocator* allocator /* = Allocator::Default */)
+    : waiting(allocator) {}
+
+// notify_one() wakes a single waiter. A waiting fiber is preferred; only when
+// no fiber is waiting is the std::condition_variable notified.
+void ConditionVariable::notify_one() {
+  // Fast path: nothing is waiting, so avoid taking the mutex.
+  if (numWaiting == 0) {
+    return;
+  }
+  {
+    marl::lock lock(mutex);
+    if (waiting.size() > 0) {
+      // The front of the list is the most recently added fiber, as waiters
+      // are inserted with emplace_front().
+      (*waiting.begin())->notify();  // Only wake one fiber.
+      return;
+    }
+  }
+  // No fibers were waiting; wake a thread blocked outside of the scheduler.
+  if (numWaitingOnCondition > 0) {
+    condition.notify_one();
+  }
+}
+
+// notify_all() wakes every waiting fiber, and then every thread blocked on
+// the std::condition_variable.
+void ConditionVariable::notify_all() {
+  // Fast path: nothing is waiting, so avoid taking the mutex.
+  if (numWaiting == 0) {
+    return;
+  }
+  {
+    // Notify all fibers registered in the waiting list.
+    marl::lock lock(mutex);
+    for (auto fiber : waiting) {
+      fiber->notify();
+    }
+  }
+  // Notify threads that are waiting outside of the scheduler.
+  if (numWaitingOnCondition > 0) {
+    condition.notify_all();
+  }
+}
+
+// wait() returns immediately if pred() is already true. Otherwise the caller
+// is blocked: a scheduler fiber registers itself in the waiting list and
+// waits via the fiber, any other caller waits on the std::condition_variable.
+// lock is forwarded to the underlying wait call.
+template <typename Predicate>
+void ConditionVariable::wait(marl::lock& lock, Predicate&& pred) {
+  if (pred()) {
+    return;
+  }
+  numWaiting++;
+  if (auto fiber = Scheduler::Fiber::current()) {
+    // Currently executing on a scheduler fiber.
+    // Yield to let other tasks run that can unblock this fiber.
+    mutex.lock();
+    auto it = waiting.emplace_front(fiber);
+    mutex.unlock();
+
+    fiber->wait(lock, pred);
+
+    // Unregister this fiber now that the wait has completed.
+    mutex.lock();
+    waiting.erase(it);
+    mutex.unlock();
+  } else {
+    // Currently running outside of the scheduler.
+    // Delegate to the std::condition_variable.
+    numWaitingOnCondition++;
+    lock.wait(condition, pred);
+    numWaitingOnCondition--;
+  }
+  numWaiting--;
+}
+
+// wait_for() turns the relative duration into an absolute
+// std::chrono::system_clock deadline and defers to wait_until(), returning
+// its result.
+template <typename Rep, typename Period, typename Predicate>
+bool ConditionVariable::wait_for(
+    marl::lock& lock,
+    const std::chrono::duration<Rep, Period>& duration,
+    Predicate&& pred) {
+  const auto deadline = std::chrono::system_clock::now() + duration;
+  return wait_until(lock, deadline, pred);
+}
+
+// wait_until() returns true immediately if pred() is already true. Otherwise
+// the caller is blocked until the underlying wait returns: a scheduler fiber
+// registers itself in the waiting list and waits via the fiber, any other
+// caller waits on the std::condition_variable. The result of the underlying
+// wait call is returned.
+template <typename Clock, typename Duration, typename Predicate>
+bool ConditionVariable::wait_until(
+    marl::lock& lock,
+    const std::chrono::time_point<Clock, Duration>& timeout,
+    Predicate&& pred) {
+  if (pred()) {
+    return true;
+  }
+
+  if (auto fiber = Scheduler::Fiber::current()) {
+    numWaiting++;
+
+    // Currently executing on a scheduler fiber.
+    // Yield to let other tasks run that can unblock this fiber.
+    mutex.lock();
+    auto it = waiting.emplace_front(fiber);
+    mutex.unlock();
+
+    auto res = fiber->wait(lock, timeout, pred);
+
+    // Unregister this fiber now that the wait has completed or timed out.
+    mutex.lock();
+    waiting.erase(it);
+    mutex.unlock();
+
+    numWaiting--;
+    return res;
+  }
+
+  // Currently running outside of the scheduler.
+  // Delegate to the std::condition_variable.
+  numWaiting++;
+  numWaitingOnCondition++;
+  auto res = lock.wait_until(condition, timeout, pred);
+  numWaitingOnCondition--;
+  numWaiting--;
+  return res;
+}
+
+}  // namespace marl
+
+#endif  // marl_condition_variable_h
--- a/3party/marl/include/marl/containers.h
+++ b/3party/marl/include/marl/containers.h
@ -0,0 +1,571 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef marl_containers_h
+#define marl_containers_h
+
+#include "debug.h"
+#include "memory.h"
+
+#include <algorithm>  // std::max
+#include <cstddef>    // size_t
+#include <utility>    // std::move
+
+#include <deque>
+#include <map>
+#include <set>
+#include <unordered_map>
+#include <unordered_set>
+
+namespace marl {
+namespace containers {
+
+////////////////////////////////////////////////////////////////////////////////
+// STL wrappers
+// STL containers that use a marl::StlAllocator backed by a marl::Allocator.
+// Note: These may be re-implemented to optimize for marl's usage cases.
+// See: https://github.com/google/marl/issues/129
+////////////////////////////////////////////////////////////////////////////////
+// std::deque of T using a StlAllocator.
+template <typename T>
+using deque = std::deque<T, StlAllocator<T>>;
+
+// std::map from K to V, ordered by C, using a StlAllocator.
+template <typename K, typename V, typename C = std::less<K>>
+using map = std::map<K, V, C, StlAllocator<std::pair<const K, V>>>;
+
+// std::set of K, ordered by C, using a StlAllocator.
+template <typename K, typename C = std::less<K>>
+using set = std::set<K, C, StlAllocator<K>>;
+
+// std::unordered_map from K to V with hasher H and equality E, using a
+// StlAllocator.
+template <typename K,
+          typename V,
+          typename H = std::hash<K>,
+          typename E = std::equal_to<K>>
+using unordered_map =
+    std::unordered_map<K, V, H, E, StlAllocator<std::pair<const K, V>>>;
+
+// std::unordered_set of K with hasher H and equality E, using a StlAllocator.
+template <typename K, typename H = std::hash<K>, typename E = std::equal_to<K>>
+using unordered_set = std::unordered_set<K, H, E, StlAllocator<K>>;
+
+// take() removes the element at the front of the deque and returns it.
+// The deque must not be empty.
+template <typename T>
+MARL_NO_EXPORT inline T take(deque<T>& queue) {
+  T front = std::move(queue.front());
+  queue.pop_front();
+  return front;
+}
+
+// take() removes the element at begin() of the unordered_set and returns it.
+// The set must not be empty.
+template <typename T, typename H, typename E>
+MARL_NO_EXPORT inline T take(unordered_set<T, H, E>& set) {
+  auto first = set.begin();
+  T value = std::move(*first);
+  set.erase(first);
+  return value;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// vector<T, BASE_CAPACITY>
+////////////////////////////////////////////////////////////////////////////////
+
+// vector is a container of contiguously stored elements.
+// Unlike std::vector, marl::containers::vector keeps the first
+// BASE_CAPACITY elements internally, which will avoid dynamic heap
+// allocations. Once the vector exceeds BASE_CAPACITY elements, vector will
+// allocate storage from the heap.
+template <typename T, int BASE_CAPACITY>
+class vector {
+ public:
+  // Constructs an empty vector that allocates from allocator when it grows.
+  MARL_NO_EXPORT inline vector(Allocator* allocator = Allocator::Default);
+
+  // Constructs a vector holding copies of the elements of other.
+  template <int BASE_CAPACITY_2>
+  MARL_NO_EXPORT inline vector(const vector<T, BASE_CAPACITY_2>& other,
+                               Allocator* allocator = Allocator::Default);
+
+  // Constructs a vector by moving the elements out of other, which is left
+  // empty.
+  template <int BASE_CAPACITY_2>
+  MARL_NO_EXPORT inline vector(vector<T, BASE_CAPACITY_2>&& other,
+                               Allocator* allocator = Allocator::Default);
+
+  MARL_NO_EXPORT inline ~vector();
+
+  // Replaces the contents of this vector with copies of the other's elements.
+  MARL_NO_EXPORT inline vector& operator=(const vector&);
+
+  template <int BASE_CAPACITY_2>
+  MARL_NO_EXPORT inline vector<T, BASE_CAPACITY>& operator=(
+      const vector<T, BASE_CAPACITY_2>&);
+
+  // Replaces the contents of this vector with the elements moved out of the
+  // other vector, which is left empty.
+  template <int BASE_CAPACITY_2>
+  MARL_NO_EXPORT inline vector<T, BASE_CAPACITY>& operator=(
+      vector<T, BASE_CAPACITY_2>&&);
+
+  // Element insertion and removal at the end of the vector.
+  MARL_NO_EXPORT inline void push_back(const T& el);
+  MARL_NO_EXPORT inline void emplace_back(T&& el);
+  MARL_NO_EXPORT inline void pop_back();
+  // Element access. front(), back() and operator[] assert on invalid access.
+  MARL_NO_EXPORT inline T& front();
+  MARL_NO_EXPORT inline T& back();
+  MARL_NO_EXPORT inline const T& front() const;
+  MARL_NO_EXPORT inline const T& back() const;
+  // Iteration; iterators are plain pointers into the element storage.
+  MARL_NO_EXPORT inline T* begin();
+  MARL_NO_EXPORT inline T* end();
+  MARL_NO_EXPORT inline const T* begin() const;
+  MARL_NO_EXPORT inline const T* end() const;
+  MARL_NO_EXPORT inline T& operator[](size_t i);
+  MARL_NO_EXPORT inline const T& operator[](size_t i) const;
+  // Returns the number of elements held by the vector.
+  MARL_NO_EXPORT inline size_t size() const;
+  // NOTE(review): no definition of cap() is visible in this part of the file -
+  // confirm that it is defined before use.
+  MARL_NO_EXPORT inline size_t cap() const;
+  // resize() grows (default-constructing) or shrinks (destructing) the vector
+  // to exactly n elements.
+  MARL_NO_EXPORT inline void resize(size_t n);
+  // reserve() ensures there is storage for at least n elements.
+  MARL_NO_EXPORT inline void reserve(size_t n);
+  MARL_NO_EXPORT inline T* data();
+  MARL_NO_EXPORT inline const T* data() const;
+
+  // The allocator used for storage beyond BASE_CAPACITY elements.
+  Allocator* const allocator;
+
+ private:
+  // Raw storage with the size and alignment of a single T.
+  using TStorage = typename marl::aligned_storage<sizeof(T), alignof(T)>::type;
+
+  vector(const vector&) = delete;
+
+  MARL_NO_EXPORT inline void free();
+
+  // Number of constructed elements.
+  size_t count = 0;
+  // Number of elements the current storage can hold.
+  size_t capacity = BASE_CAPACITY;
+  // Internal storage used until the vector outgrows BASE_CAPACITY.
+  TStorage buffer[BASE_CAPACITY];
+  // The storage currently in use: initially buffer, replaced by reserve()
+  // when the vector grows.
+  TStorage* elements = buffer;
+  // The allocation backing elements once reserve() has grown the vector.
+  Allocation allocation;
+};
+
+// Constructs an empty vector that will use allocator_ when it needs to grow.
+template <typename T, int BASE_CAPACITY>
+vector<T, BASE_CAPACITY>::vector(
+    Allocator* allocator_ /* = Allocator::Default */)
+    : allocator(allocator_) {}
+
+// Constructs a vector holding copies of the elements of other (which may have
+// a different BASE_CAPACITY) by delegating to the copy-assignment operator.
+template <typename T, int BASE_CAPACITY>
+template <int BASE_CAPACITY_2>
+vector<T, BASE_CAPACITY>::vector(
+    const vector<T, BASE_CAPACITY_2>& other,
+    Allocator* allocator_ /* = Allocator::Default */)
+    : allocator(allocator_) {
+  *this = other;
+}
+
+// Constructs a vector by moving the elements out of other (which may have a
+// different BASE_CAPACITY) by delegating to the move-assignment operator.
+template <typename T, int BASE_CAPACITY>
+template <int BASE_CAPACITY_2>
+vector<T, BASE_CAPACITY>::vector(
+    vector<T, BASE_CAPACITY_2>&& other,
+    Allocator* allocator_ /* = Allocator::Default */)
+    : allocator(allocator_) {
+  *this = std::move(other);
+}
+
+// Destructor. Releases the vector's contents via free().
+template <typename T, int BASE_CAPACITY>
+vector<T, BASE_CAPACITY>::~vector() {
+  free();
+}
+
+// Copy-assignment: replaces the contents of this vector with copies of the
+// elements of other. Returns *this.
+template <typename T, int BASE_CAPACITY>
+vector<T, BASE_CAPACITY>& vector<T, BASE_CAPACITY>::operator=(
+    const vector& other) {
+  // Guard against self-assignment: free() below releases this vector's
+  // elements, which would otherwise be the very elements being copied from.
+  if (this == &other) {
+    return *this;
+  }
+  free();
+  reserve(other.size());
+  count = other.size();
+  // Copy-construct each element in place into the raw storage.
+  for (size_t i = 0; i < count; i++) {
+    new (&reinterpret_cast<T*>(elements)[i]) T(other[i]);
+  }
+  return *this;
+}
+
+// Copy-assignment from a vector with a different BASE_CAPACITY: replaces the
+// contents of this vector with copies of the elements of other.
+template <typename T, int BASE_CAPACITY>
+template <int BASE_CAPACITY_2>
+vector<T, BASE_CAPACITY>& vector<T, BASE_CAPACITY>::operator=(
+    const vector<T, BASE_CAPACITY_2>& other) {
+  free();
+  const size_t n = other.size();
+  reserve(n);
+  count = n;
+  // Copy-construct each element in place into the raw storage.
+  T* const dst = reinterpret_cast<T*>(elements);
+  for (size_t idx = 0; idx < count; idx++) {
+    new (&dst[idx]) T(other[idx]);
+  }
+  return *this;
+}
+
+// Move-assignment: replaces the contents of this vector with the elements
+// moved out of other. other is resized to zero elements afterwards.
+template <typename T, int BASE_CAPACITY>
+template <int BASE_CAPACITY_2>
+vector<T, BASE_CAPACITY>& vector<T, BASE_CAPACITY>::operator=(
+    vector<T, BASE_CAPACITY_2>&& other) {
+  free();
+  const size_t n = other.size();
+  reserve(n);
+  count = n;
+  // Move-construct each element in place into the raw storage.
+  T* const dst = reinterpret_cast<T*>(elements);
+  for (size_t idx = 0; idx < count; idx++) {
+    new (&dst[idx]) T(std::move(other[idx]));
+  }
+  other.resize(0);
+  return *this;
+}
+
+// push_back() appends a copy of el to the end of the vector, growing the
+// storage first if required.
+template <typename T, int BASE_CAPACITY>
+void vector<T, BASE_CAPACITY>::push_back(const T& el) {
+  reserve(count + 1);
+  T* const slot = &reinterpret_cast<T*>(elements)[count];
+  new (slot) T(el);
+  count++;
+}
+
+// emplace_back() appends el to the end of the vector by move-construction,
+// growing the storage first if required.
+template <typename T, int BASE_CAPACITY>
+void vector<T, BASE_CAPACITY>::emplace_back(T&& el) {
+  reserve(count + 1);
+  T* const slot = &reinterpret_cast<T*>(elements)[count];
+  new (slot) T(std::move(el));
+  count++;
+}
+
+// pop_back() destructs and removes the last element of the vector.
+// Asserts that the vector is not empty.
+template <typename T, int BASE_CAPACITY>
+void vector<T, BASE_CAPACITY>::pop_back() {
+  MARL_ASSERT(count > 0, "pop_back() called on empty vector");
+  count--;
+  T* const items = reinterpret_cast<T*>(elements);
+  items[count].~T();
+}
+
+// front() returns a reference to the first element.
+// Asserts that the vector is not empty.
+template <typename T, int BASE_CAPACITY>
+T& vector<T, BASE_CAPACITY>::front() {
+  MARL_ASSERT(count > 0, "front() called on empty vector");
+  T* const items = reinterpret_cast<T*>(elements);
+  return items[0];
+}
+
+// back() returns a reference to the last element.
+// Asserts that the vector is not empty.
+template <typename T, int BASE_CAPACITY>
+T& vector<T, BASE_CAPACITY>::back() {
+  MARL_ASSERT(count > 0, "back() called on empty vector");
+  T* const items = reinterpret_cast<T*>(elements);
+  return items[count - 1];
+}
+
+// front() returns a const reference to the first element.
+// Asserts that the vector is not empty.
+template <typename T, int BASE_CAPACITY>
+const T& vector<T, BASE_CAPACITY>::front() const {
+  MARL_ASSERT(count > 0, "front() called on empty vector");
+  T* const items = reinterpret_cast<T*>(elements);
+  return items[0];
+}
+
+// back() returns a const reference to the last element.
+// Asserts that the vector is not empty.
+template <typename T, int BASE_CAPACITY>
+const T& vector<T, BASE_CAPACITY>::back() const {
+  MARL_ASSERT(count > 0, "back() called on empty vector");
+  T* const items = reinterpret_cast<T*>(elements);
+  return items[count - 1];
+}
+
+// begin() returns a pointer to the first element of the element storage.
+template <typename T, int BASE_CAPACITY>
+T* vector<T, BASE_CAPACITY>::begin() {
+  return reinterpret_cast<T*>(elements);
+}
+
+// end() returns a pointer to one past the last element.
+template <typename T, int BASE_CAPACITY>
+T* vector<T, BASE_CAPACITY>::end() {
+  T* const first = reinterpret_cast<T*>(elements);
+  return first + count;
+}
+
+// begin() returns a const pointer to the first element of the element storage.
+template <typename T, int BASE_CAPACITY>
+const T* vector<T, BASE_CAPACITY>::begin() const {
+  return reinterpret_cast<T*>(elements);
+}
+
+// end() returns a const pointer to one past the last element of the vector.
+template <typename T, int BASE_CAPACITY>
+const T* vector<T, BASE_CAPACITY>::end() const {
+  return reinterpret_cast<T*>(elements) + count;
+}
+
+// operator[] returns a reference to the element at index i.
+// Asserts (in debug builds) that i is less than the vector size.
+template <typename T, int BASE_CAPACITY>
+T& vector<T, BASE_CAPACITY>::operator[](size_t i) {
+  MARL_ASSERT(i < count, "index %d exceeds vector size %d", int(i), int(count));
+  T* const base = reinterpret_cast<T*>(elements);
+  return base[i];
+}
+
+// operator[] returns a const reference to the element at index i.
+// Asserts (in debug builds) that i is less than the vector size.
+template <typename T, int BASE_CAPACITY>
+const T& vector<T, BASE_CAPACITY>::operator[](size_t i) const {
+  MARL_ASSERT(i < count, "index %d exceeds vector size %d", int(i), int(count));
+  const T* const base = reinterpret_cast<T*>(elements);
+  return base[i];
+}
+
+// size() returns the number of elements currently held by the vector.
+template <typename T, int BASE_CAPACITY>
+size_t vector<T, BASE_CAPACITY>::size() const {
+  return count;
+}
+
+// resize() changes the number of elements in the vector to n.
+// When growing, new elements are default-constructed at the end; when
+// shrinking, elements are destroyed from the back.
+template <typename T, int BASE_CAPACITY>
+void vector<T, BASE_CAPACITY>::resize(size_t n) {
+  reserve(n);
+  // Read the storage pointer only after reserve(), which may replace it.
+  T* const items = reinterpret_cast<T*>(elements);
+  while (count < n) {
+    T* const slot = items + count;
+    ++count;
+    new (slot) T();
+  }
+  while (count > n) {
+    --count;
+    items[count].~T();
+  }
+}
+
+// reserve() ensures the vector has storage for at least n elements.
+// If n exceeds the current capacity, a new allocation of max(n * 2, 8)
+// elements is requested from the allocator, the existing elements are
+// move-constructed into it, and the previous storage is released with free().
+template <typename T, int BASE_CAPACITY>
+void vector<T, BASE_CAPACITY>::reserve(size_t n) {
+  if (n <= capacity) {
+    return;  // Existing storage is already large enough.
+  }
+
+  capacity = std::max<size_t>(n * 2, 8);
+
+  Allocation::Request request;
+  request.size = sizeof(T) * capacity;
+  request.alignment = alignof(T);
+  request.usage = Allocation::Usage::Vector;
+
+  auto alloc = allocator->allocate(request);
+  auto grown = reinterpret_cast<TStorage*>(alloc.ptr);
+
+  T* const from = reinterpret_cast<T*>(elements);
+  T* const to = reinterpret_cast<T*>(grown);
+  for (size_t i = 0; i < count; i++) {
+    new (&to[i]) T(std::move(from[i]));
+  }
+
+  // free() destroys the moved-from elements and releases any previous
+  // allocation, so it must run before elements / allocation are replaced.
+  free();
+  elements = grown;
+  allocation = alloc;
+}
+
+// data() returns a pointer to the first element of the vector's storage.
+template <typename T, int BASE_CAPACITY>
+T* vector<T, BASE_CAPACITY>::data() {
+  // elements holds raw element storage (reserve() assigns it a TStorage*), so
+  // it has to be cast to T* like every other accessor of this class does.
+  return reinterpret_cast<T*>(elements);
+}
+
+// data() returns a const pointer to the first element of the vector's
+// storage.
+template <typename T, int BASE_CAPACITY>
+const T* vector<T, BASE_CAPACITY>::data() const {
+  // elements holds raw element storage (reserve() assigns it a TStorage*), so
+  // it has to be cast to the element type like every other accessor does.
+  return reinterpret_cast<const T*>(elements);
+}
+
+// free() runs the destructor of every element and, if the vector owns an
+// allocation, returns it to the allocator and clears the storage pointer.
+// Note: count is not modified by this function.
+template <typename T, int BASE_CAPACITY>
+void vector<T, BASE_CAPACITY>::free() {
+  T* const items = reinterpret_cast<T*>(elements);
+  for (size_t i = 0; i < count; i++) {
+    items[i].~T();
+  }
+
+  if (allocation.ptr == nullptr) {
+    return;  // No allocation to release.
+  }
+  allocator->free(allocation);
+  allocation = {};
+  elements = nullptr;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// list<T>
+////////////////////////////////////////////////////////////////////////////////
+
+// list is a minimal std::list like container that supports constant time
+// insertion and removal of elements.
+// list keeps hold of allocations (it only releases allocations on destruction),
+// to avoid repeated heap allocations and frees when frequently inserting and
+// removing elements.
+template <typename T>
+class list {
+  // Entry is a single node of the list: the element plus its links to the
+  // neighbouring entries.
+  struct Entry {
+    T data;
+    Entry* next;
+    Entry* prev;
+  };
+
+ public:
+  // iterator is a forward iterator over the elements of the list.
+  // The end iterator holds a null entry.
+  class iterator {
+   public:
+    MARL_NO_EXPORT inline iterator(Entry*);
+    MARL_NO_EXPORT inline T* operator->();
+    MARL_NO_EXPORT inline T& operator*();
+    MARL_NO_EXPORT inline iterator& operator++();
+    MARL_NO_EXPORT inline bool operator==(const iterator&) const;
+    MARL_NO_EXPORT inline bool operator!=(const iterator&) const;
+
+   private:
+    friend list;
+    Entry* entry;
+  };
+
+  MARL_NO_EXPORT inline list(Allocator* allocator = Allocator::Default);
+  MARL_NO_EXPORT inline ~list();
+
+  // begin() returns an iterator to the first element of the list.
+  MARL_NO_EXPORT inline iterator begin();
+  // end() returns the iterator that marks the end of the list.
+  MARL_NO_EXPORT inline iterator end();
+  // size() returns the number of elements in the list.
+  MARL_NO_EXPORT inline size_t size() const;
+
+  // emplace_front() constructs a new element from args at the front of the
+  // list and returns an iterator to it.
+  template <typename... Args>
+  MARL_NO_EXPORT inline iterator emplace_front(Args&&... args);
+  // erase() removes and destroys the element referenced by the iterator.
+  MARL_NO_EXPORT inline void erase(iterator);
+
+ private:
+  // copy / move is currently unsupported.
+  list(const list&) = delete;
+  list(list&&) = delete;
+  list& operator=(const list&) = delete;
+  list& operator=(list&&) = delete;
+
+  // AllocationChain is a singly-linked list of the allocations made by
+  // grow(). The chain is walked by the destructor to release them.
+  struct AllocationChain {
+    Allocation allocation;
+    AllocationChain* next;
+  };
+
+  // grow() allocates count additional entries and adds them to the free list.
+  MARL_NO_EXPORT inline void grow(size_t count);
+
+  // unlink() removes entry from the chain headed by list.
+  MARL_NO_EXPORT static inline void unlink(Entry* entry, Entry*& list);
+  // link() inserts entry at the front of the chain headed by list.
+  MARL_NO_EXPORT static inline void link(Entry* entry, Entry*& list);
+
+  Allocator* const allocator;
+  size_t size_ = 0;      // Number of live elements.
+  size_t capacity = 0;   // Total number of entries allocated so far.
+  AllocationChain* allocations = nullptr;  // Allocations made by grow().
+  Entry* free = nullptr;  // Head of the chain of unused entries.
+  Entry* head = nullptr;  // Head of the chain of live entries.
+};
+
+// Constructs an iterator that refers to the given entry (null for end()).
+template <typename T>
+list<T>::iterator::iterator(Entry* entry_) : entry(entry_) {}
+
+// operator-> returns a pointer to the element the iterator refers to.
+// The iterator must not be the end iterator (its entry is dereferenced).
+template <typename T>
+T* list<T>::iterator::operator->() {
+  return &entry->data;
+}
+
+// operator* returns a reference to the element the iterator refers to.
+// The iterator must not be the end iterator (its entry is dereferenced).
+template <typename T>
+T& list<T>::iterator::operator*() {
+  return entry->data;
+}
+
+// Pre-increment: advances the iterator to the next entry in the list and
+// returns a reference to this iterator.
+template <typename T>
+typename list<T>::iterator& list<T>::iterator::operator++() {
+  entry = entry->next;
+  return *this;
+}
+
+// Two iterators are equal when they refer to the same entry.
+template <typename T>
+bool list<T>::iterator::operator==(const iterator& rhs) const {
+  return entry == rhs.entry;
+}
+
+// Two iterators are unequal when they refer to different entries.
+template <typename T>
+bool list<T>::iterator::operator!=(const iterator& rhs) const {
+  return entry != rhs.entry;
+}
+
+// Constructs an empty list that will use allocator_ for its allocations.
+// No memory is allocated until the list needs to grow.
+template <typename T>
+list<T>::list(Allocator* allocator_ /* = Allocator::Default */)
+    : allocator(allocator_) {}
+
+// Destroys every live element, then releases all allocations made by the
+// list.
+template <typename T>
+list<T>::~list() {
+  Entry* el = head;
+  while (el != nullptr) {
+    el->data.~T();
+    el = el->next;
+  }
+
+  // Each AllocationChain node is stored inside the allocation it describes,
+  // so the next pointer has to be read before the allocation is freed.
+  for (AllocationChain* chain = allocations; chain != nullptr;) {
+    AllocationChain* const following = chain->next;
+    allocator->free(chain->allocation);
+    chain = following;
+  }
+}
+
+// begin() returns an iterator to the first element of the list. For an empty
+// list this compares equal to end().
+template <typename T>
+typename list<T>::iterator list<T>::begin() {
+  return iterator(head);
+}
+
+// end() returns the past-the-end iterator, which refers to no entry.
+template <typename T>
+typename list<T>::iterator list<T>::end() {
+  return iterator(nullptr);
+}
+
+// size() returns the number of elements currently in the list.
+template <typename T>
+size_t list<T>::size() const {
+  return size_;
+}
+
+// emplace_front() constructs a new element at the front of the list from
+// args, and returns an iterator to the new element.
+// A free entry is reused if one is available; otherwise grow() is called to
+// allocate more entries.
+template <typename T>
+template <typename... Args>
+typename list<T>::iterator list<T>::emplace_front(Args&&... args) {
+  if (free == nullptr) {
+    grow(std::max<size_t>(capacity, 8));
+  }
+
+  auto entry = free;
+
+  // Move the entry from the free chain to the front of the live chain.
+  unlink(entry, free);
+  link(entry, head);
+
+  // Forward each argument with its own deduced type. Forwarding as T would
+  // cast every argument to T&&, which moves from lvalue arguments and does not
+  // compile for constructor arguments whose type is unrelated to T.
+  new (&entry->data) T(std::forward<Args>(args)...);
+  size_++;
+
+  return entry;
+}
+
+// erase() removes the element referenced by it from the list and destroys
+// it. The entry is returned to the free chain for reuse; no memory is
+// released.
+template <typename T>
+void list<T>::erase(iterator it) {
+  Entry* const removed = it.entry;
+  unlink(removed, head);
+  link(removed, free);
+
+  removed->data.~T();
+  --size_;
+}
+
+// grow() makes a single allocation that holds count new entries followed by
+// an AllocationChain record for the allocation itself. The new entries are
+// pushed onto the free chain and the record is pushed onto the allocations
+// chain so that the destructor can release it.
+template <typename T>
+void list<T>::grow(size_t count) {
+  // Layout: [Entry x count][padding to alignof(AllocationChain)][AllocationChain]
+  auto const entriesBytes = sizeof(Entry) * count;
+  auto const chainOffset = alignUp(entriesBytes, alignof(AllocationChain));
+  auto const totalBytes = chainOffset + sizeof(AllocationChain);
+
+  Allocation::Request request;
+  request.size = totalBytes;
+  request.alignment = std::max(alignof(Entry), alignof(AllocationChain));
+  request.usage = Allocation::Usage::List;
+  auto alloc = allocator->allocate(request);
+
+  // Push each new entry onto the front of the free chain.
+  Entry* const block = reinterpret_cast<Entry*>(alloc.ptr);
+  for (Entry* entry = block; entry != block + count; ++entry) {
+    entry->prev = nullptr;
+    entry->next = free;
+    if (free != nullptr) {
+      free->prev = entry;
+    }
+    free = entry;
+  }
+
+  // Record the allocation in the chain node stored at the end of the block.
+  uint8_t* const bytes = reinterpret_cast<uint8_t*>(alloc.ptr);
+  auto record = reinterpret_cast<AllocationChain*>(bytes + chainOffset);
+  record->allocation = alloc;
+  record->next = allocations;
+  allocations = record;
+
+  capacity += count;
+}
+
+// unlink() detaches entry from the doubly-linked chain whose head pointer is
+// list. If entry is the current head, the head advances to the next entry.
+// On return entry has null prev and next links.
+template <typename T>
+void list<T>::unlink(Entry* entry, Entry*& list) {
+  Entry* const before = entry->prev;
+  Entry* const after = entry->next;
+  if (list == entry) {
+    list = after;
+  }
+  if (before != nullptr) {
+    before->next = after;
+  }
+  if (after != nullptr) {
+    after->prev = before;
+  }
+  entry->next = nullptr;
+  entry->prev = nullptr;
+}
+
+// link() inserts entry at the front of the doubly-linked chain whose head
+// pointer is list, making entry the new head.
+// Asserts (in debug builds) that entry is not already linked.
+template <typename T>
+void list<T>::link(Entry* entry, Entry*& list) {
+  MARL_ASSERT(entry->next == nullptr, "link() called on entry already linked");
+  MARL_ASSERT(entry->prev == nullptr, "link() called on entry already linked");
+  if (Entry* const previousHead = list) {
+    entry->next = previousHead;
+    previousHead->prev = entry;
+  }
+  list = entry;
+}
+
+}  // namespace containers
+}  // namespace marl
+
+#endif  // marl_containers_h
--- a/3party/marl/include/marl/dag.h
+++ b/3party/marl/include/marl/dag.h
@ -0,0 +1,410 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// marl::DAG<> provides an ahead of time, declarative, directed acyclic
+// task graph.
+
+#ifndef marl_dag_h
+#define marl_dag_h
+
+#include "containers.h"
+#include "export.h"
+#include "memory.h"
+#include "scheduler.h"
+#include "waitgroup.h"
+
+namespace marl {
+namespace detail {
+// DAGCounter is the atomic counter type used to track outstanding
+// dependencies of a DAG node during a run.
+using DAGCounter = std::atomic<uint32_t>;
+
+// DAGRunContext holds the per-run state of a DAG: the data value handed to
+// each node's work function, and the dependency counters.
+template <typename T>
+struct DAGRunContext {
+  T data;
+  Allocator::unique_ptr<DAGCounter> counters;
+
+  // invoke() calls f with the run's data.
+  template <typename F>
+  MARL_NO_EXPORT inline void invoke(F&& f) {
+    f(data);
+  }
+};
+
+// Specialization of DAGRunContext for DAGs that carry no data. Work functions
+// are called with no arguments.
+template <>
+struct DAGRunContext<void> {
+  Allocator::unique_ptr<DAGCounter> counters;
+
+  // invoke() calls f with no arguments.
+  template <typename F>
+  MARL_NO_EXPORT inline void invoke(F&& f) {
+    f();
+  }
+};
+
+// DAGWork<T>::type is the function type of a node's work: void(T).
+template <typename T>
+struct DAGWork {
+  using type = std::function<void(T)>;
+};
+
+// DAGWork<void>::type is the function type of a node's work when the DAG
+// carries no data: void().
+template <>
+struct DAGWork<void> {
+  using type = std::function<void()>;
+};
+}  // namespace detail
+
+///////////////////////////////////////////////////////////////////////////////
+// Forward declarations
+///////////////////////////////////////////////////////////////////////////////
+template <typename T>
+class DAG;
+
+template <typename T>
+class DAGBuilder;
+
+template <typename T>
+class DAGNodeBuilder;
+
+///////////////////////////////////////////////////////////////////////////////
+// DAGBase<T>
+///////////////////////////////////////////////////////////////////////////////
+
+// DAGBase is derived by DAG<T> and DAG<void>. It has no public API.
+// It holds the nodes of the graph and implements the traversal that is shared
+// by both DAG variants.
+template <typename T>
+class DAGBase {
+ protected:
+  friend DAGBuilder<T>;
+  friend DAGNodeBuilder<T>;
+
+  using RunContext = detail::DAGRunContext<T>;
+  using Counter = detail::DAGCounter;
+  using NodeIndex = size_t;
+  using Work = typename detail::DAGWork<T>::type;
+  // Base capacities passed to the containers::vector members below.
+  static const constexpr size_t NumReservedNodes = 32;
+  static const constexpr size_t NumReservedNumOuts = 4;
+  // Sentinel value of Node::counterIndex for nodes that have no counter.
+  static const constexpr size_t InvalidCounterIndex = ~static_cast<size_t>(0);
+  // Index of the root node in nodes.
+  static const constexpr NodeIndex RootIndex = 0;
+  // Sentinel node index that refers to no node.
+  static const constexpr NodeIndex InvalidNodeIndex =
+      ~static_cast<NodeIndex>(0);
+
+  // DAG work node.
+  struct Node {
+    MARL_NO_EXPORT inline Node() = default;
+    MARL_NO_EXPORT inline Node(Work&& work);
+    MARL_NO_EXPORT inline Node(const Work& work);
+
+    // The work to perform for this node in the graph.
+    Work work;
+
+    // counterIndex if valid, is the index of the counter in the RunContext for
+    // this node. The counter is decremented for each completed dependency task
+    // (ins), and once it reaches 0, this node will be invoked.
+    size_t counterIndex = InvalidCounterIndex;
+
+    // Indices for all downstream nodes.
+    containers::vector<NodeIndex, NumReservedNumOuts> outs;
+  };
+
+  // initCounters() allocates and initializes the ctx->counters from
+  // initialCounters.
+  MARL_NO_EXPORT inline void initCounters(RunContext* ctx,
+                                          Allocator* allocator);
+
+  // notify() is called each time a dependency task (ins) has completed for the
+  // node with the given index.
+  // If all dependency tasks have completed (or this is the root node) then
+  // notify() returns true and the caller should then call invoke().
+  MARL_NO_EXPORT inline bool notify(RunContext*, NodeIndex);
+
+  // invoke() calls the work function for the node with the given index, then
+  // calls notify() and possibly invoke() for all the dependee nodes.
+  MARL_NO_EXPORT inline void invoke(RunContext*, NodeIndex, WaitGroup*);
+
+  // nodes is the full list of the nodes in the graph.
+  // nodes[0] is always the root node, which has no dependencies (ins).
+  containers::vector<Node, NumReservedNodes> nodes;
+
+  // initialCounters is a list of initial counter values to be copied to
+  // RunContext::counters on DAG<>::run().
+  // initialCounters is indexed by Node::counterIndex, and only contains counts
+  // for nodes that have at least 2 dependencies (ins) - because of this the
+  // number of entries in initialCounters may be fewer than nodes.
+  containers::vector<uint32_t, NumReservedNodes> initialCounters;
+};
+
+// Constructs a node that takes ownership of work by moving from it.
+template <typename T>
+DAGBase<T>::Node::Node(Work&& work) : work(std::move(work)) {}
+
+// Constructs a node that holds a copy of work.
+template <typename T>
+DAGBase<T>::Node::Node(const Work& work) : work(work) {}
+
+// initCounters() allocates one counter per entry of initialCounters using
+// allocator, stores them in ctx->counters, and sets each counter to its
+// initial value.
+template <typename T>
+void DAGBase<T>::initCounters(RunContext* ctx, Allocator* allocator) {
+  auto numCounters = initialCounters.size();
+  ctx->counters = allocator->make_unique_n<Counter>(numCounters);
+  auto counters = ctx->counters.get();
+  for (size_t i = 0; i != numCounters; ++i) {
+    counters[i] = {initialCounters[i]};
+  }
+}
+
+// notify() records that one dependency of the node at nodeIdx has completed.
+// Returns true when the node is ready to be invoked: either the node has no
+// counter, or this call decremented the node's counter to 0.
+template <typename T>
+bool DAGBase<T>::notify(RunContext* ctx, NodeIndex nodeIdx) {
+  Node* node = &nodes[nodeIdx];
+
+  // Only nodes with multiple dependencies have a counter to decrement.
+  if (node->counterIndex != InvalidCounterIndex) {
+    auto counters = ctx->counters.get();
+    auto remaining = --counters[node->counterIndex];
+    return remaining == 0;
+  }
+  return true;
+}
+
+// invoke() runs the work of the node at nodeIdx (if it has any), then
+// notifies each downstream node (outs) and invokes those that became ready.
+// All ready downstream nodes except the last are scheduled as tasks, each
+// registered with wg; the last one is invoked directly by this call.
+template <typename T>
+void DAGBase<T>::invoke(RunContext* ctx, NodeIndex nodeIdx, WaitGroup* wg) {
+  Node* node = &nodes[nodeIdx];
+
+  // Run this node's work.
+  if (node->work) {
+    ctx->invoke(node->work);
+  }
+
+  // Then call notify() on all dependees (outs), and invoke() those that
+  // returned true.
+  // We buffer the node to invoke (toInvoke) so we can schedule() all but the
+  // last node to invoke(), and directly call the last invoke() on this thread.
+  // This is done to avoid the overheads of scheduling when a direct call would
+  // suffice.
+  NodeIndex toInvoke = InvalidNodeIndex;
+  for (NodeIndex idx : node->outs) {
+    if (notify(ctx, idx)) {
+      // A previously buffered ready node exists: schedule it before buffering
+      // the newly ready node.
+      if (toInvoke != InvalidNodeIndex) {
+        wg->add(1);
+        // Schedule while promoting the WaitGroup capture from a pointer
+        // reference to a value. This ensures that the WaitGroup isn't dropped
+        // while in use.
+        schedule(
+            [=](WaitGroup wg) {
+              invoke(ctx, toInvoke, &wg);
+              wg.done();
+            },
+            *wg);
+      }
+      toInvoke = idx;
+    }
+  }
+  // Invoke the last ready node (if any) directly.
+  if (toInvoke != InvalidNodeIndex) {
+    invoke(ctx, toInvoke, wg);
+  }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// DAGNodeBuilder<T>
+///////////////////////////////////////////////////////////////////////////////
+
+// DAGNodeBuilder is the builder interface for a DAG node.
+// It identifies a node by the DAGBuilder that created it and the node's
+// index.
+template <typename T>
+class DAGNodeBuilder {
+  using NodeIndex = typename DAGBase<T>::NodeIndex;
+
+ public:
+  // then() builds and returns a new DAG node that will be invoked after this
+  // node has completed.
+  //
+  // F is a function that will be called when the new DAG node is invoked, with
+  // the signature:
+  //   void(T)   when T is not void
+  // or
+  //   void()    when T is void
+  template <typename F>
+  MARL_NO_EXPORT inline DAGNodeBuilder then(F&&);
+
+ private:
+  // Only DAGBuilder<T> may construct DAGNodeBuilders.
+  friend DAGBuilder<T>;
+  MARL_NO_EXPORT inline DAGNodeBuilder(DAGBuilder<T>*, NodeIndex);
+  DAGBuilder<T>* builder;  // The builder that created this node.
+  NodeIndex index;         // Index of this node.
+};
+
+// Constructs a DAGNodeBuilder for the node with the given index of builder.
+template <typename T>
+DAGNodeBuilder<T>::DAGNodeBuilder(DAGBuilder<T>* builder, NodeIndex index)
+    : builder(builder), index(index) {}
+
+// then() creates a new node that runs work, adds this node as a dependency of
+// the new node, and returns the new node.
+template <typename T>
+template <typename F>
+DAGNodeBuilder<T> DAGNodeBuilder<T>::then(F&& work) {
+  DAGNodeBuilder<T> next = builder->node(std::forward<F>(work));
+  builder->addDependency(*this, next);
+  return next;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// DAGBuilder<T>
+///////////////////////////////////////////////////////////////////////////////
+// DAGBuilder is used to construct a DAG<T>: nodes are added with node() and
+// DAGNodeBuilder::then(), dependencies with addDependency(), and the finished
+// graph is obtained with build().
+template <typename T>
+class DAGBuilder {
+ public:
+  // DAGBuilder constructor
+  MARL_NO_EXPORT inline DAGBuilder(Allocator* allocator = Allocator::Default);
+
+  // root() returns the root DAG node.
+  MARL_NO_EXPORT inline DAGNodeBuilder<T> root();
+
+  // node() builds and returns a new DAG node with no initial dependencies.
+  // The returned node must be attached to the graph in order to invoke F or any
+  // of the dependees of this returned node.
+  //
+  // F is a function that will be called when the new DAG node is invoked, with
+  // the signature:
+  //   void(T)   when T is not void
+  // or
+  //   void()    when T is void
+  template <typename F>
+  MARL_NO_EXPORT inline DAGNodeBuilder<T> node(F&& work);
+
+  // node() builds and returns a new DAG node that depends on all the tasks in
+  // after to be completed before invoking F.
+  //
+  // F is a function that will be called when the new DAG node is invoked, with
+  // the signature:
+  //   void(T)   when T is not void
+  // or
+  //   void()    when T is void
+  template <typename F>
+  MARL_NO_EXPORT inline DAGNodeBuilder<T> node(
+      F&& work,
+      std::initializer_list<DAGNodeBuilder<T>> after);
+
+  // addDependency() adds parent as dependency on child. All dependencies of
+  // child must have completed before child is invoked.
+  MARL_NO_EXPORT inline void addDependency(DAGNodeBuilder<T> parent,
+                                           DAGNodeBuilder<T> child);
+
+  // build() constructs and returns the DAG. No other methods of this class may
+  // be called after calling build().
+  MARL_NO_EXPORT inline Allocator::unique_ptr<DAG<T>> build();
+
+ private:
+  // Base capacity of the numIns vector.
+  static const constexpr size_t NumReservedNumIns = 4;
+  using Node = typename DAG<T>::Node;
+
+  // The DAG being built.
+  Allocator::unique_ptr<DAG<T>> dag;
+
+  // Number of dependencies (ins) for each node in dag->nodes.
+  containers::vector<uint32_t, NumReservedNumIns> numIns;
+};
+
+// Constructs a DAGBuilder that uses allocator to create the DAG, and adds the
+// root node (which has no work and no dependencies).
+template <typename T>
+DAGBuilder<T>::DAGBuilder(Allocator* allocator /* = Allocator::Default */)
+    : dag(allocator->make_unique<DAG<T>>()), numIns(allocator) {
+  // Add root
+  dag->nodes.emplace_back(Node{});
+  numIns.emplace_back(0);
+}
+
+// root() returns a DAGNodeBuilder that refers to the root node of the graph.
+template <typename T>
+DAGNodeBuilder<T> DAGBuilder<T>::root() {
+  return DAGNodeBuilder<T>(this, DAGBase<T>::RootIndex);
+}
+
+// node() creates a new node that runs work and has no dependencies, by
+// forwarding to the overload that takes a dependency list with an empty list.
+template <typename T>
+template <typename F>
+DAGNodeBuilder<T> DAGBuilder<T>::node(F&& work) {
+  return node(std::forward<F>(work), {});
+}
+
+// node() appends a new node that runs work to the graph, registers every node
+// in after as a dependency of the new node, and returns the new node.
+template <typename T>
+template <typename F>
+DAGNodeBuilder<T> DAGBuilder<T>::node(
+    F&& work,
+    std::initializer_list<DAGNodeBuilder<T>> after) {
+  MARL_ASSERT(numIns.size() == dag->nodes.size(),
+              "NodeBuilder vectors out of sync");
+  // The new node is appended, so its index is the current node count.
+  const auto newIndex = dag->nodes.size();
+  numIns.emplace_back(0);
+  dag->nodes.emplace_back(Node{std::forward<F>(work)});
+
+  DAGNodeBuilder<T> created{this, newIndex};
+  for (const DAGNodeBuilder<T>& dependency : after) {
+    addDependency(dependency, created);
+  }
+  return created;
+}
+
+// addDependency() makes child depend on parent: child's dependency count is
+// incremented and child is added to parent's list of downstream nodes.
+template <typename T>
+void DAGBuilder<T>::addDependency(DAGNodeBuilder<T> parent,
+                                  DAGNodeBuilder<T> child) {
+  ++numIns[child.index];
+  Node& upstream = dag->nodes[parent.index];
+  upstream.outs.push_back(child.index);
+}
+
+// build() finalizes the graph and returns ownership of the DAG.
+// Every node with more than one dependency is assigned a counter index, and
+// its dependency count is recorded as the counter's initial value.
+template <typename T>
+Allocator::unique_ptr<DAG<T>> DAGBuilder<T>::build() {
+  const auto numNodes = dag->nodes.size();
+  MARL_ASSERT(numIns.size() == dag->nodes.size(),
+              "NodeBuilder vectors out of sync");
+  for (size_t i = 0; i < numNodes; i++) {
+    if (numIns[i] <= 1) {
+      continue;  // Nodes with at most one dependency need no counter.
+    }
+    dag->nodes[i].counterIndex = dag->initialCounters.size();
+    dag->initialCounters.push_back(numIns[i]);
+  }
+  return std::move(dag);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// DAG<T>
+///////////////////////////////////////////////////////////////////////////////
+// DAG<T> is a task graph whose node functions are each called with a value of
+// type T. DAGs are constructed using a DAG<T>::Builder.
+template <typename T = void>
+class DAG : public DAGBase<T> {
+ public:
+  using Builder = DAGBuilder<T>;
+  using NodeBuilder = DAGNodeBuilder<T>;
+
+  // run() invokes the function of each node in the graph of the DAG, passing
+  // data to each, starting with the root node. All dependencies need to have
+  // completed their function before dependees will be invoked.
+  MARL_NO_EXPORT inline void run(T& data,
+                                 Allocator* allocator = Allocator::Default);
+};
+
+// run() executes the graph: a run context is created from arg, the dependency
+// counters are initialized using allocator, the root node is invoked, and the
+// call blocks until the WaitGroup used for scheduled nodes has completed.
+template <typename T>
+void DAG<T>::run(T& arg, Allocator* allocator /* = Allocator::Default */) {
+  using Context = typename DAGBase<T>::RunContext;
+  Context ctx{arg};
+  this->initCounters(&ctx, allocator);
+
+  WaitGroup wg;
+  this->invoke(&ctx, DAGBase<T>::RootIndex, &wg);
+  wg.wait();
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// DAG<void>
+///////////////////////////////////////////////////////////////////////////////
+// DAG<void> is the specialization of DAG for graphs whose node functions take
+// no arguments.
+template <>
+class DAG<void> : public DAGBase<void> {
+ public:
+  using Builder = DAGBuilder<void>;
+  using NodeBuilder = DAGNodeBuilder<void>;
+
+  // run() invokes the function of each node in the graph of the DAG, starting
+  // with the root node. All dependencies need to have completed their function
+  // before dependees will be invoked.
+  MARL_NO_EXPORT inline void run(Allocator* allocator = Allocator::Default);
+};
+
+// run() executes the graph: the dependency counters are initialized using
+// allocator, the root node is invoked, and the call blocks until the
+// WaitGroup used for scheduled nodes has completed.
+void DAG<void>::run(Allocator* allocator /* = Allocator::Default */) {
+  using Context = DAGBase<void>::RunContext;
+  Context ctx{};
+  this->initCounters(&ctx, allocator);
+
+  WaitGroup wg;
+  this->invoke(&ctx, DAGBase<void>::RootIndex, &wg);
+  wg.wait();
+}
+
+}  // namespace marl
+
+#endif  // marl_dag_h
--- a/3party/marl/include/marl/debug.h
+++ b/3party/marl/include/marl/debug.h
@ -0,0 +1,61 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef marl_debug_h
+#define marl_debug_h
+
+#include "export.h"
+
+// MARL_DEBUG_ENABLED controls whether the MARL_ASSERT / MARL_FATAL / MARL_WARN
+// family of macros do anything. Unless it has already been defined, it
+// defaults to 1 when NDEBUG is not defined or DCHECK_ALWAYS_ON is defined, and
+// to 0 otherwise.
+#if !defined(MARL_DEBUG_ENABLED)
+#if !defined(NDEBUG) || defined(DCHECK_ALWAYS_ON)
+#define MARL_DEBUG_ENABLED 1
+#else
+#define MARL_DEBUG_ENABLED 0
+#endif
+#endif
+
+namespace marl {
+
+// fatal() is called by MARL_FATAL() with a format message and its variadic
+// arguments.
+MARL_EXPORT
+void fatal(const char* msg, ...);
+
+// warn() is called by MARL_WARN() with a format message and its variadic
+// arguments.
+MARL_EXPORT
+void warn(const char* msg, ...);
+
+// assert_has_bound_scheduler() is called by
+// MARL_ASSERT_HAS_BOUND_SCHEDULER() with the name of the feature being used.
+MARL_EXPORT
+void assert_has_bound_scheduler(const char* feature);
+
+// Debug macros. When MARL_DEBUG_ENABLED is 0 they all expand to nothing, so
+// their arguments are not evaluated.
+// Note: the enabled definitions already end with a semicolon.
+#if MARL_DEBUG_ENABLED
+// MARL_FATAL() calls marl::fatal() with msg (a string literal, to which a
+// newline is appended) and any additional arguments.
+#define MARL_FATAL(msg, ...) marl::fatal(msg "\n", ##__VA_ARGS__);
+// MARL_ASSERT() calls MARL_FATAL() with msg prefixed by "ASSERT: " if cond
+// evaluates to false.
+#define MARL_ASSERT(cond, msg, ...)              \
+  do {                                           \
+    if (!(cond)) {                               \
+      MARL_FATAL("ASSERT: " msg, ##__VA_ARGS__); \
+    }                                            \
+  } while (false);
+// MARL_ASSERT_HAS_BOUND_SCHEDULER() calls marl::assert_has_bound_scheduler()
+// with the given feature name.
+#define MARL_ASSERT_HAS_BOUND_SCHEDULER(feature) \
+  marl::assert_has_bound_scheduler(feature);
+// MARL_UNREACHABLE() calls MARL_FATAL() with the message "UNREACHABLE".
+#define MARL_UNREACHABLE() MARL_FATAL("UNREACHABLE");
+// MARL_WARN() calls marl::warn() with msg prefixed by "WARNING: " and
+// followed by a newline.
+#define MARL_WARN(msg, ...) marl::warn("WARNING: " msg "\n", ##__VA_ARGS__);
+#else
+#define MARL_FATAL(msg, ...)
+#define MARL_ASSERT(cond, msg, ...)
+#define MARL_ASSERT_HAS_BOUND_SCHEDULER(feature)
+#define MARL_UNREACHABLE()
+#define MARL_WARN(msg, ...)
+#endif
+
+}  // namespace marl
+
+#endif  // marl_debug_h
--- a/3party/marl/include/marl/defer.h
+++ b/3party/marl/include/marl/defer.h
@ -0,0 +1,45 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef marl_defer_h
+#define marl_defer_h
+
+#include "finally.h"
+
+namespace marl {
+
+// MARL_CONCAT() joins a and b into a single token. The extra level of
+// indirection through MARL_CONCAT_() lets macro arguments such as __LINE__ be
+// expanded before they are pasted together.
+#define MARL_CONCAT_(a, b) a##b
+#define MARL_CONCAT(a, b) MARL_CONCAT_(a, b)
+
+// defer() is a macro to defer execution of a statement until the surrounding
+// scope is closed and is typically used to perform cleanup logic once a
+// function returns.
+//
+// Note: Unlike golang's defer(), the defer statement is executed when the
+// surrounding *scope* is closed, not necessarily the function.
+//
+// Note: the statement is wrapped in a lambda that captures by reference, and
+// the variable that holds it is named after the current line number, so two
+// defer() statements on the same line in one scope would clash.
+//
+// Example usage:
+//
+//  void sayHelloWorld()
+//  {
+//      defer(printf("world\n"));
+//      printf("hello ");
+//  }
+//
+#define defer(x) \
+  auto MARL_CONCAT(defer_, __LINE__) = marl::make_finally([&] { x; })
+
+}  // namespace marl
+
+#endif  // marl_defer_h
--- a/3party/marl/include/marl/deprecated.h
+++ b/3party/marl/include/marl/deprecated.h
@ -0,0 +1,38 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef marl_deprecated_h
+#define marl_deprecated_h
+
+// MARL_WARN_DEPRECATED controls whether MARL_DEPRECATED() emits a deprecation
+// attribute. It defaults to 1 unless already defined.
+#ifndef MARL_WARN_DEPRECATED
+#define MARL_WARN_DEPRECATED 1
+#endif  // MARL_WARN_DEPRECATED
+
+// MARL_DEPRECATED(issue_num, message) marks a declaration as deprecated. The
+// deprecation text is message followed by a link to the marl GitHub issue
+// numbered issue_num. message must be a string literal.
+// When MARL_WARN_DEPRECATED is 0 the macro expands to nothing.
+#if MARL_WARN_DEPRECATED
+#if defined(_WIN32)
+// Windows builds use the __declspec(deprecated) form.
+#define MARL_DEPRECATED(issue_num, message)                              \
+  __declspec(deprecated(                                                 \
+      message "\nSee: https://github.com/google/marl/issues/" #issue_num \
+              " for more information"))
+#else
+// All other builds use the __attribute__((deprecated)) form.
+#define MARL_DEPRECATED(issue_num, message)                              \
+  __attribute__((deprecated(                                             \
+      message "\nSee: https://github.com/google/marl/issues/" #issue_num \
+              " for more information")))
+#endif
+#else
+#define MARL_DEPRECATED(issue_num, message)
+#endif
+
+#endif  // marl_deprecated_h
--- a/3party/marl/include/marl/event.h
+++ b/3party/marl/include/marl/event.h
@ -0,0 +1,250 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef marl_event_h
+#define marl_event_h
+
+#include "conditionvariable.h"
+#include "containers.h"
+#include "export.h"
+#include "memory.h"
+
+#include <chrono>
+
+namespace marl {
+
+// Event is a synchronization primitive used to block until a signal is raised.
+// All state is held in a shared_ptr, and every method is const, so copies of
+// an Event refer to the same underlying state.
+class Event {
+ public:
+  enum class Mode : uint8_t {
+    // The event signal will be automatically reset when a call to wait()
+    // returns.
+    // A single call to signal() will only unblock a single (possibly
+    // future) call to wait().
+    Auto,
+
+    // While the event is in the signaled state, any calls to wait() will
+    // unblock without automatically resetting the signaled state.
+    // The signaled state can be reset with a call to clear().
+    Manual
+  };
+
+  MARL_NO_EXPORT inline Event(Mode mode = Mode::Auto,
+                              bool initialState = false,
+                              Allocator* allocator = Allocator::Default);
+
+  // signal() signals the event, possibly unblocking a call to wait().
+  MARL_NO_EXPORT inline void signal() const;
+
+  // clear() clears the signaled state.
+  MARL_NO_EXPORT inline void clear() const;
+
+  // wait() blocks until the event is signaled.
+  // If the event was constructed with the Auto Mode, then only one
+  // call to wait() will unblock before returning, upon which the signalled
+  // state will be automatically cleared.
+  MARL_NO_EXPORT inline void wait() const;
+
+  // wait_for() blocks until the event is signaled, or the timeout has been
+  // reached.
+  // If the timeout was reached, then wait_for() returns false.
+  // If the event is signalled and event was constructed with the Auto Mode,
+  // then only one call to wait() will unblock before returning, upon which the
+  // signalled state will be automatically cleared.
+  template <typename Rep, typename Period>
+  MARL_NO_EXPORT inline bool wait_for(
+      const std::chrono::duration<Rep, Period>& duration) const;
+
+  // wait_until() blocks until the event is signaled, or the timeout has been
+  // reached.
+  // If the timeout was reached, then wait_until() returns false.
+  // If the event is signalled and event was constructed with the Auto Mode,
+  // then only one call to wait() will unblock before returning, upon which the
+  // signalled state will be automatically cleared.
+  template <typename Clock, typename Duration>
+  MARL_NO_EXPORT inline bool wait_until(
+      const std::chrono::time_point<Clock, Duration>& timeout) const;
+
+  // test() returns true if the event is signaled, otherwise false.
+  // If the event is signalled and was constructed with the Auto Mode
+  // then the signalled state will be automatically cleared upon returning.
+  MARL_NO_EXPORT inline bool test() const;
+
+  // isSignalled() returns true if the event is signaled, otherwise false.
+  // Unlike test() the signal is not automatically cleared when the event was
+  // constructed with the Auto Mode.
+  // Note: No lock is held after isSignalled() returns, so the event state may
+  // immediately change after returning. Use with caution.
+  MARL_NO_EXPORT inline bool isSignalled() const;
+
+  // any returns an event that is automatically signalled whenever any of the
+  // events in the list are signalled.
+  template <typename Iterator>
+  MARL_NO_EXPORT inline static Event any(Mode mode,
+                                         const Iterator& begin,
+                                         const Iterator& end);
+
+  // any returns an event that is automatically signalled whenever any of the
+  // events in the list are signalled.
+  // This overload defaults to using the Auto mode.
+  template <typename Iterator>
+  MARL_NO_EXPORT inline static Event any(const Iterator& begin,
+                                         const Iterator& end);
+
+ private:
+  // Shared holds the state of an event.
+  struct Shared {
+    MARL_NO_EXPORT inline Shared(Allocator* allocator,
+                                 Mode mode,
+                                 bool initialState);
+    MARL_NO_EXPORT inline void signal();
+    MARL_NO_EXPORT inline void wait();
+
+    template <typename Rep, typename Period>
+    MARL_NO_EXPORT inline bool wait_for(
+        const std::chrono::duration<Rep, Period>& duration);
+
+    template <typename Clock, typename Duration>
+    MARL_NO_EXPORT inline bool wait_until(
+        const std::chrono::time_point<Clock, Duration>& timeout);
+
+    marl::mutex mutex;     // Guards signalled and deps.
+    ConditionVariable cv;  // Notified when the event becomes signalled.
+    // Events to signal whenever this event becomes signalled (see any()).
+    containers::vector<std::shared_ptr<Shared>, 1> deps;
+    const Mode mode;
+    bool signalled;
+  };
+
+  const std::shared_ptr<Shared> shared;
+};
+
+// Constructs the shared event state with the given mode and initial signalled
+// state. allocator is passed to the condition variable.
+Event::Shared::Shared(Allocator* allocator, Mode mode_, bool initialState)
+    : cv(allocator), mode(mode_), signalled(initialState) {}
+
+// signal() puts the event into the signalled state. If the event was already
+// signalled this is a no-op. Otherwise one waiter (Auto mode) or all waiters
+// (Manual mode) are notified, and every dependent event in deps is signalled.
+void Event::Shared::signal() {
+  marl::lock lock(mutex);
+  if (!signalled) {
+    signalled = true;
+    if (mode != Mode::Auto) {
+      cv.notify_all();
+    } else {
+      cv.notify_one();
+    }
+    // Propagate to the events created by Event::any().
+    for (auto dependent : deps) {
+      dependent->signal();
+    }
+  }
+}
+
+// wait() blocks until the event is signalled. In Auto mode the signalled
+// state is cleared before returning.
+void Event::Shared::wait() {
+  marl::lock lock(mutex);
+  cv.wait(lock, [this] { return signalled; });
+  if (mode != Mode::Auto) {
+    return;
+  }
+  signalled = false;
+}
+
+// wait_for() blocks until the event is signalled or duration has elapsed.
+// Returns false if the wait timed out. Otherwise returns true, and in Auto
+// mode the signalled state is cleared before returning.
+template <typename Rep, typename Period>
+bool Event::Shared::wait_for(
+    const std::chrono::duration<Rep, Period>& duration) {
+  marl::lock lock(mutex);
+  const bool wasSignalled =
+      cv.wait_for(lock, duration, [this] { return signalled; });
+  if (wasSignalled && mode == Mode::Auto) {
+    signalled = false;
+  }
+  return wasSignalled;
+}
+
+// wait_until() blocks until the event is signalled or the timeout time point
+// has been reached. Returns false if the wait timed out. Otherwise returns
+// true, and in Auto mode the signalled state is cleared before returning.
+template <typename Clock, typename Duration>
+bool Event::Shared::wait_until(
+    const std::chrono::time_point<Clock, Duration>& timeout) {
+  marl::lock lock(mutex);
+  const bool wasSignalled =
+      cv.wait_until(lock, timeout, [this] { return signalled; });
+  if (wasSignalled && mode == Mode::Auto) {
+    signalled = false;
+  }
+  return wasSignalled;
+}
+
+// Constructs an Event with the given mode and initial signalled state. The
+// shared state is created with allocator.
+Event::Event(Mode mode /* = Mode::Auto */,
+             bool initialState /* = false */,
+             Allocator* allocator /* = Allocator::Default */)
+    : shared(allocator->make_shared<Shared>(allocator, mode, initialState)) {}
+
+// signal() forwards to Shared::signal() on the shared state.
+void Event::signal() const {
+  shared->signal();
+}
+
+// clear() resets the signalled state to false while holding the event's
+// mutex.
+void Event::clear() const {
+  marl::lock lock(shared->mutex);
+  shared->signalled = false;
+}
+
+// wait() forwards to Shared::wait() on the shared state.
+void Event::wait() const {
+  shared->wait();
+}
+
+// wait_for() forwards to Shared::wait_for() on the shared state.
+template <typename Rep, typename Period>
+bool Event::wait_for(const std::chrono::duration<Rep, Period>& duration) const {
+  return shared->wait_for(duration);
+}
+
+// wait_until() forwards to Shared::wait_until() on the shared state.
+template <typename Clock, typename Duration>
+bool Event::wait_until(
+    const std::chrono::time_point<Clock, Duration>& timeout) const {
+  return shared->wait_until(timeout);
+}
+
+// test() returns whether the event is signalled, without blocking. If the
+// event is signalled and is in Auto mode, the signalled state is cleared
+// before returning.
+bool Event::test() const {
+  marl::lock lock(shared->mutex);
+  const bool wasSignalled = shared->signalled;
+  if (wasSignalled && shared->mode == Mode::Auto) {
+    shared->signalled = false;
+  }
+  return wasSignalled;
+}
+
+// isSignalled() returns the current signalled state, read while holding the
+// event's mutex. The state is not modified.
+bool Event::isSignalled() const {
+  marl::lock lock(shared->mutex);
+  return shared->signalled;
+}
+
+// any() returns a new event with the given mode that is signalled whenever
+// any of the events in the range [begin, end) is signalled. If an event in
+// the range is already signalled, the returned event is signalled
+// immediately.
+template <typename Iterator>
+Event Event::any(Mode mode, const Iterator& begin, const Iterator& end) {
+  Event any(mode, false);
+  // Pre-increment is used so that iterators which only provide operator++()
+  // (such as containers::list<T>::iterator) are supported.
+  for (auto it = begin; it != end; ++it) {
+    // The source event owns its shared state for the duration of the loop, so
+    // a reference avoids a needless shared_ptr copy.
+    const auto& s = it->shared;
+    marl::lock lock(s->mutex);
+    if (s->signalled) {
+      any.signal();
+    }
+    // Register the new event so that future signals are propagated to it.
+    s->deps.push_back(any.shared);
+  }
+  return any;
+}
+
+// any() overload that creates the returned event with the Auto mode.
+template <typename Iterator>
+Event Event::any(const Iterator& begin, const Iterator& end) {
+  return any(Mode::Auto, begin, end);
+}
+
+}  // namespace marl
+
+#endif  // marl_event_h
--- a/3party/marl/include/marl/export.h
+++ b/3party/marl/include/marl/export.h
@ -0,0 +1,43 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef marl_export_h
+#define marl_export_h
+
+// When MARL_DLL is defined, MARL_EXPORT maps to the __declspec import/export
+// attributes.
+#ifdef MARL_DLL
+
+// A non-zero MARL_BUILDING_DLL selects dllexport; otherwise dllimport is used.
+#if MARL_BUILDING_DLL
+#define MARL_EXPORT __declspec(dllexport)
+#else
+#define MARL_EXPORT __declspec(dllimport)
+#endif
+
+#else  // #ifdef MARL_DLL
+
+// Without MARL_DLL, compilers reporting __GNUC__ >= 4 get explicit symbol
+// visibility attributes.
+#if __GNUC__ >= 4
+#define MARL_EXPORT __attribute__((visibility("default")))
+#define MARL_NO_EXPORT __attribute__((visibility("hidden")))
+#endif
+
+#endif
+
+// Fallbacks: any macro left undefined above expands to nothing.
+#ifndef MARL_EXPORT
+#define MARL_EXPORT
+#endif
+
+#ifndef MARL_NO_EXPORT
+#define MARL_NO_EXPORT
+#endif
+
+#endif  // marl_export_h
--- a/3party/marl/include/marl/finally.h
+++ b/3party/marl/include/marl/finally.h
@ -0,0 +1,92 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Finally can be used to execute a lambda or function when the final reference
+// to the Finally is dropped.
+//
+// The purpose of a finally is to perform cleanup or termination logic and is
+// especially useful when there are multiple early returns within a function.
+//
+// A moveable Finally can be constructed with marl::make_finally().
+// A sharable Finally can be constructed with marl::make_shared_finally().
+
+#ifndef marl_finally_h
+#define marl_finally_h
+
+#include "export.h"
+
+#include <functional>
+#include <memory>
+#include <utility>
+
+namespace marl {
+
+// Finally is a polymorphic base class (it declares only a virtual destructor),
+// implemented by the templated FinallyImpl.
+class Finally {
+ public:
+  // Virtual so that destroying through a Finally pointer or reference runs the
+  // destructor of the derived type.
+  virtual ~Finally() = default;
+};
+
+// FinallyImpl implements a Finally.
+// The template parameter F is the function type to be called when the finally
+// is destructed. F must have the signature void().
+template <typename F>
+class FinallyImpl : public Finally {
+ public:
+  // Stores a copy of func.
+  MARL_NO_EXPORT inline FinallyImpl(const F& func);
+  // Stores func by moving from it.
+  MARL_NO_EXPORT inline FinallyImpl(F&& func);
+  // Takes over other's function; other is marked invalid so that its
+  // destructor does not call the function.
+  MARL_NO_EXPORT inline FinallyImpl(FinallyImpl<F>&& other);
+  // Calls the stored function if this object is still valid.
+  MARL_NO_EXPORT inline ~FinallyImpl();
+
+ private:
+  // Copying and assignment are disabled.
+  FinallyImpl(const FinallyImpl<F>& other) = delete;
+  FinallyImpl<F>& operator=(const FinallyImpl<F>& other) = delete;
+  FinallyImpl<F>& operator=(FinallyImpl<F>&&) = delete;
+  F func;             // The function invoked by the destructor.
+  bool valid = true;  // False once this object has been moved from.
+};
+
+// Copy-constructs the stored function from func_.
+template <typename F>
+FinallyImpl<F>::FinallyImpl(const F& func_) : func(func_) {}
+
+// Move-constructs the stored function from func_.
+template <typename F>
+FinallyImpl<F>::FinallyImpl(F&& func_) : func(std::move(func_)) {}
+
+// Move constructor: takes over other's function and its validity, then marks
+// other as invalid so that only one of the two objects ever calls the
+// function.
+//
+// The validity flag is copied from other rather than left at its default of
+// true: moving from an object that has already been moved from must not
+// produce an object that invokes a moved-from function on destruction.
+template <typename F>
+FinallyImpl<F>::FinallyImpl(FinallyImpl<F>&& other)
+    : func(std::move(other.func)), valid(other.valid) {
+  other.valid = false;
+}
+
+// Destructor: invokes the stored function unless this object has been marked
+// invalid (which the move constructor does to its source).
+template <typename F>
+FinallyImpl<F>::~FinallyImpl() {
+  if (!valid) {
+    return;
+  }
+  func();
+}
+
+// make_finally() returns a FinallyImpl<F> constructed from the forwarded f.
+//
+// NOTE(review): F is a forwarding-reference parameter, so passing an lvalue
+// deduces F as a reference type. FinallyImpl<F> declares both a `const F&`
+// and an `F&&` constructor, which look like they collapse to the same
+// signature in that case - confirm whether lvalue callables are meant to be
+// supported.
+template <typename F>
+inline FinallyImpl<F> make_finally(F&& f) {
+  return FinallyImpl<F>(std::forward<F>(f));
+}
+
+// make_shared_finally() creates a FinallyImpl<F> from the forwarded f with
+// std::make_shared and returns it as a std::shared_ptr<Finally>.
+//
+// NOTE(review): as with make_finally(), passing an lvalue deduces F as a
+// reference type - confirm that FinallyImpl<F> supports that.
+template <typename F>
+inline std::shared_ptr<Finally> make_shared_finally(F&& f) {
+  return std::make_shared<FinallyImpl<F>>(std::forward<F>(f));
+}
+
+}  // namespace marl
+
+#endif  // marl_finally_h
--- a/3party/marl/include/marl/memory.h
+++ b/3party/marl/include/marl/memory.h
@ -0,0 +1,461 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef marl_memory_h
+#define marl_memory_h
+
+#include "debug.h"
+#include "export.h"
+
+#include <stdint.h>
+
+#include <array>
+#include <cstddef>  // std::ptrdiff_t
+#include <cstdlib>
+#include <limits>  // std::numeric_limits
+#include <memory>
+#include <mutex>
+#include <utility>  // std::forward
+
+namespace marl {
+
+template <typename T>
+struct StlAllocator;
+
+// pageSize() returns the size in bytes of a virtual memory page for the host
+// system.
+MARL_EXPORT
+size_t pageSize();
+
+// alignUp() rounds val up to the next multiple of alignment.
+// alignment is used as a divisor, so it must not be zero.
+template <typename T>
+MARL_NO_EXPORT inline T alignUp(T val, T alignment) {
+  // Number of alignment-sized blocks needed to cover val.
+  const auto blocks = (val + alignment - 1) / alignment;
+  return alignment * blocks;
+}
+
+// aligned_storage() is a replacement for std::aligned_storage that isn't busted
+// on older versions of MSVC.
+template <size_t SIZE, size_t ALIGNMENT>
+struct aligned_storage {
+  // type is a buffer of SIZE bytes whose alignment is ALIGNMENT.
+  struct alignas(ALIGNMENT) type {
+    unsigned char data[SIZE];
+  };
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Allocation
+///////////////////////////////////////////////////////////////////////////////
+
+// Allocation holds the result of a memory allocation from an Allocator.
+struct Allocation {
+  // Intended usage of the allocation. Used for allocation trackers.
+  // The underlying type is uint8_t.
+  enum class Usage : uint8_t {
+    Undefined = 0,
+    Stack,   // Fiber stack
+    Create,  // Allocator::create(), make_unique(), make_shared()
+    Vector,  // marl::containers::vector<T>
+    List,    // marl::containers::list<T>
+    Stl,     // marl::StlAllocator
+    Count,   // Not intended to be used as a usage type - used for upper bound.
+  };
+
+  // Request holds all the information required to make an allocation.
+  // A default-constructed Request has zero size and alignment, no guards and
+  // Usage::Undefined.
+  struct Request {
+    size_t size = 0;                 // The size of the allocation in bytes.
+    size_t alignment = 0;            // The minimum alignment of the allocation.
+    bool useGuards = false;          // Whether the allocation is guarded.
+    Usage usage = Usage::Undefined;  // Intended usage of the allocation.
+  };
+
+  void* ptr = nullptr;  // The pointer to the allocated memory.
+  Request request;      // Request used for the allocation.
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Allocator
+///////////////////////////////////////////////////////////////////////////////
+
+// Allocator is an interface to a memory allocator.
+// Marl provides a default implementation with Allocator::Default.
+class Allocator {
+ public:
+  // The default allocator. Initialized with an implementation that allocates
+  // from the OS. Can be assigned a custom implementation.
+  MARL_EXPORT static Allocator* Default;
+
+  // Deleter is a smart-pointer compatible deleter that can be used to delete
+  // objects created by Allocator::create(). Deleter is used by the smart
+  // pointers returned by make_shared() and make_unique().
+  struct MARL_EXPORT Deleter {
+    MARL_NO_EXPORT inline Deleter();
+    MARL_NO_EXPORT inline Deleter(Allocator* allocator, size_t count);
+
+    // Destructs object and frees its memory through allocator.
+    template <typename T>
+    MARL_NO_EXPORT inline void operator()(T* object);
+
+    // The allocator the memory is returned to.
+    Allocator* allocator = nullptr;
+    // Number of T-sized elements in the allocation being freed.
+    size_t count = 0;
+  };
+
+  // unique_ptr<T> is an alias to std::unique_ptr<T, Deleter>.
+  template <typename T>
+  using unique_ptr = std::unique_ptr<T, Deleter>;
+
+  virtual ~Allocator() = default;
+
+  // allocate() allocates memory from the allocator.
+  // The returned Allocation::request field must be equal to the Request
+  // parameter.
+  virtual Allocation allocate(const Allocation::Request&) = 0;
+
+  // free() frees the memory returned by allocate().
+  // The Allocation must have all fields equal to those returned by allocate().
+  virtual void free(const Allocation&) = 0;
+
+  // create() allocates and constructs an object of type T, respecting the
+  // alignment of the type.
+  // The pointer returned by create() must be deleted with destroy().
+  template <typename T, typename... ARGS>
+  inline T* create(ARGS&&... args);
+
+  // destroy() destructs and frees the object allocated with create().
+  template <typename T>
+  inline void destroy(T* object);
+
+  // make_unique() returns a new object allocated from the allocator wrapped
+  // in a unique_ptr that respects the alignment of the type.
+  template <typename T, typename... ARGS>
+  inline unique_ptr<T> make_unique(ARGS&&... args);
+
+  // make_unique_n() returns an array of n new objects allocated from the
+  // allocator wrapped in a unique_ptr that respects the alignment of the
+  // type.
+  // NOTE(review): the definition in this header allocates storage for n
+  // objects but constructs only one T - confirm the intended contract.
+  template <typename T, typename... ARGS>
+  inline unique_ptr<T> make_unique_n(size_t n, ARGS&&... args);
+
+  // make_shared() returns a new object allocated from the allocator
+  // wrapped in a std::shared_ptr that respects the alignment of the type.
+  template <typename T, typename... ARGS>
+  inline std::shared_ptr<T> make_shared(ARGS&&... args);
+
+ protected:
+  // Only derived classes can construct an Allocator.
+  Allocator() = default;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Allocator::Deleter
+///////////////////////////////////////////////////////////////////////////////
+// Default constructor: the deleter has a null allocator. operator() uses the
+// allocator without a null check, so such a deleter must not be invoked.
+Allocator::Deleter::Deleter() : allocator(nullptr) {}
+// Constructs a deleter that frees count elements through allocator_.
+Allocator::Deleter::Deleter(Allocator* allocator_, size_t count_)
+    : allocator(allocator_), count(count_) {}
+
+// Destructs *object and returns its memory to the allocator.
+// The freed size is sizeof(T) * count, with Usage::Create, matching the
+// requests built by make_unique_n() and make_shared().
+// Only the single object pointed to by object is destructed, regardless of
+// count.
+template <typename T>
+void Allocator::Deleter::operator()(T* object) {
+  // The destructor must run before the storage is handed back.
+  object->~T();
+
+  Allocation released;
+  released.request.usage = Allocation::Usage::Create;
+  released.request.alignment = alignof(T);
+  released.request.size = sizeof(T) * count;
+  released.ptr = object;
+  allocator->free(released);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Allocator
+///////////////////////////////////////////////////////////////////////////////
+// create() requests sizeof(T) bytes aligned to alignof(T) with Usage::Create,
+// then constructs a T in that memory from the forwarded args.
+// The pointer returned by allocate() is used without a null check.
+template <typename T, typename... ARGS>
+T* Allocator::create(ARGS&&... args) {
+  Allocation::Request request;
+  request.usage = Allocation::Usage::Create;
+  request.alignment = alignof(T);
+  request.size = sizeof(T);
+
+  const Allocation block = allocate(request);
+  // Placement-new yields the typed pointer to the freshly constructed object.
+  return new (block.ptr) T(std::forward<ARGS>(args)...);
+}
+
+// destroy() runs object's destructor and then frees its memory using the
+// same size, alignment and usage that create() requests.
+template <typename T>
+void Allocator::destroy(T* object) {
+  object->~T();
+
+  Allocation released;
+  released.request.usage = Allocation::Usage::Create;
+  released.request.alignment = alignof(T);
+  released.request.size = sizeof(T);
+  released.ptr = object;
+  free(released);
+}
+
+// make_unique() is make_unique_n() with a count of one.
+template <typename T, typename... ARGS>
+Allocator::unique_ptr<T> Allocator::make_unique(ARGS&&... args) {
+  return make_unique_n<T>(1, std::forward<ARGS>(args)...);
+}
+
+// make_unique_n() allocates storage for n objects of type T (aligned to
+// alignof(T), Usage::Create) and returns it wrapped in a unique_ptr whose
+// Deleter records this allocator and n.
+// Returns an empty unique_ptr when n is zero.
+//
+// NOTE(review): only the first element is constructed (from the forwarded
+// args); the remaining n - 1 slots are left as raw storage, and the Deleter
+// only destructs the first element - confirm this is the intended contract.
+template <typename T, typename... ARGS>
+Allocator::unique_ptr<T> Allocator::make_unique_n(size_t n, ARGS&&... args) {
+  if (n == 0) {
+    return unique_ptr<T>();
+  }
+
+  Allocation::Request request;
+  request.usage = Allocation::Usage::Create;
+  request.alignment = alignof(T);
+  request.size = sizeof(T) * n;
+
+  const Allocation block = allocate(request);
+  T* first = new (block.ptr) T(std::forward<ARGS>(args)...);
+  return unique_ptr<T>(first, Deleter{this, n});
+}
+
+// make_shared() allocates sizeof(T) bytes aligned to alignof(T) with
+// Usage::Create, constructs a T there from the forwarded args, and returns a
+// std::shared_ptr that releases it through a Deleter bound to this allocator.
+template <typename T, typename... ARGS>
+std::shared_ptr<T> Allocator::make_shared(ARGS&&... args) {
+  Allocation::Request request;
+  request.usage = Allocation::Usage::Create;
+  request.alignment = alignof(T);
+  request.size = sizeof(T);
+
+  const Allocation block = allocate(request);
+  T* object = new (block.ptr) T(std::forward<ARGS>(args)...);
+  return std::shared_ptr<T>(object, Deleter{this, 1});
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// TrackedAllocator
+///////////////////////////////////////////////////////////////////////////////
+
+// TrackedAllocator wraps an Allocator to track the allocations made.
+class TrackedAllocator : public Allocator {
+ public:
+  // Counters for a single Allocation::Usage value.
+  struct UsageStats {
+    // Total number of allocations.
+    size_t count = 0;
+    // total allocation size in bytes (as requested, may be higher due to
+    // alignment or guards).
+    size_t bytes = 0;
+  };
+
+  struct Stats {
+    // numAllocations() returns the total number of allocations across all
+    // usages for the allocator.
+    inline size_t numAllocations() const;
+
+    // bytesAllocated() returns the total number of bytes allocated across all
+    // usages for the allocator.
+    inline size_t bytesAllocated() const;
+
+    // Statistics per usage.
+    // Indexed by the integer value of Allocation::Usage.
+    std::array<UsageStats, size_t(Allocation::Usage::Count)> byUsage;
+  };
+
+  // Constructor that wraps an existing allocator.
+  inline TrackedAllocator(Allocator* allocator);
+
+  // stats() returns the current allocator statistics.
+  // The statistics are copied while holding the internal mutex.
+  inline Stats stats();
+
+  // Allocator compliance
+  inline Allocation allocate(const Allocation::Request&) override;
+  inline void free(const Allocation&) override;
+
+ private:
+  Allocator* const allocator;  // The wrapped allocator.
+  std::mutex mutex;            // Guards stats_.
+  Stats stats_;
+};
+
+// Sums the allocation counts of every usage bucket.
+size_t TrackedAllocator::Stats::numAllocations() const {
+  size_t total = 0;
+  for (size_t i = 0; i < byUsage.size(); ++i) {
+    total += byUsage[i].count;
+  }
+  return total;
+}
+
+// Sums the allocated byte counts of every usage bucket.
+size_t TrackedAllocator::Stats::bytesAllocated() const {
+  size_t total = 0;
+  for (size_t i = 0; i < byUsage.size(); ++i) {
+    total += byUsage[i].bytes;
+  }
+  return total;
+}
+
+// Stores the wrapped allocator. allocate() and free() forward to it, so it
+// must outlive this TrackedAllocator.
+TrackedAllocator::TrackedAllocator(Allocator* allocator_)
+    : allocator(allocator_) {}
+
+// Returns a copy of the statistics, taken while holding the mutex.
+TrackedAllocator::Stats TrackedAllocator::stats() {
+  std::lock_guard<std::mutex> guard(mutex);
+  return stats_;
+}
+
+// Records the request in the bucket for request.usage, then forwards the
+// request to the wrapped allocator. The statistics are updated before the
+// wrapped allocator is called, and the mutex is not held during that call.
+Allocation TrackedAllocator::allocate(const Allocation::Request& request) {
+  {
+    std::lock_guard<std::mutex> guard(mutex);
+    UsageStats& bucket = stats_.byUsage[int(request.usage)];
+    bucket.count += 1;
+    bucket.bytes += request.size;
+  }
+  return allocator->allocate(request);
+}
+
+// Removes the allocation from the bucket for its usage, asserting that the
+// bucket holds at least that many allocations and bytes, then forwards the
+// free to the wrapped allocator. The mutex is not held during that call.
+void TrackedAllocator::free(const Allocation& allocation) {
+  {
+    std::lock_guard<std::mutex> guard(mutex);
+    UsageStats& bucket = stats_.byUsage[int(allocation.request.usage)];
+    MARL_ASSERT(bucket.count > 0,
+                "TrackedAllocator detected abnormal free()");
+    MARL_ASSERT(bucket.bytes >= allocation.request.size,
+                "TrackedAllocator detected abnormal free()");
+    bucket.count -= 1;
+    bucket.bytes -= allocation.request.size;
+  }
+  allocator->free(allocation);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// StlAllocator
+///////////////////////////////////////////////////////////////////////////////
+
+// StlAllocator exposes an STL-compatible allocator wrapping a marl::Allocator.
+template <typename T>
+struct StlAllocator {
+  using value_type = T;
+  using pointer = T*;
+  using const_pointer = const T*;
+  using reference = T&;
+  using const_reference = const T&;
+  using size_type = size_t;
+  // The Allocator requirements call for a signed integer type here: it is the
+  // type containers use for the distance between two pointers.
+  using difference_type = std::ptrdiff_t;
+
+  // An equivalent STL allocator for a different type.
+  template <class U>
+  struct rebind {
+    typedef StlAllocator<U> other;
+  };
+
+  // Constructs an StlAllocator that will allocate using allocator.
+  // allocator must remain valid until this StlAllocator has been destroyed.
+  inline StlAllocator(Allocator* allocator);
+
+  // Converting constructor: shares the marl::Allocator used by other.
+  template <typename U>
+  inline StlAllocator(const StlAllocator<U>& other);
+
+  // Returns the actual address of x even in presence of overloaded operator&.
+  inline pointer address(reference x) const;
+  inline const_pointer address(const_reference x) const;
+
+  // Allocates the memory for n objects of type T.
+  // Does not actually construct the objects.
+  inline T* allocate(std::size_t n);
+
+  // Deallocates the memory for n objects of type T.
+  inline void deallocate(T* p, std::size_t n);
+
+  // Returns the maximum theoretically possible number of T stored in this
+  // allocator.
+  inline size_type max_size() const;
+
+  // Copy constructs an object of type T at the address p.
+  inline void construct(pointer p, const_reference val);
+
+  // Constructs an object of type U at the address P forwarding all other
+  // arguments to the constructor.
+  template <typename U, typename... Args>
+  inline void construct(U* p, Args&&... args);
+
+  // Deconstructs the object at p. It does not free the memory.
+  inline void destroy(pointer p);
+
+  // Deconstructs the object at p. It does not free the memory.
+  template <typename U>
+  inline void destroy(U* p);
+
+ private:
+  // Builds the allocation request for n objects of type T.
+  inline Allocation::Request request(size_t n) const;
+
+  // Lets the converting constructor read the allocator of other
+  // specializations.
+  template <typename U>
+  friend struct StlAllocator;
+  Allocator* allocator;
+};
+
+// Stores the marl::Allocator that allocate() and deallocate() forward to.
+template <typename T>
+StlAllocator<T>::StlAllocator(Allocator* allocator_) : allocator(allocator_) {}
+
+// Converting constructor: adopts the marl::Allocator used by an StlAllocator
+// of a different value type.
+template <typename T>
+template <typename U>
+StlAllocator<T>::StlAllocator(const StlAllocator<U>& other)
+    : allocator(other.allocator) {}
+
+// Returns the actual address of x. std::addressof is used instead of the
+// built-in & so that the result is correct even when T overloads operator&.
+template <typename T>
+typename StlAllocator<T>::pointer StlAllocator<T>::address(reference x) const {
+  return std::addressof(x);
+}
+// Returns the actual address of the const object x. std::addressof is used
+// instead of the built-in & so that the result is correct even when T
+// overloads operator&.
+template <typename T>
+typename StlAllocator<T>::const_pointer StlAllocator<T>::address(
+    const_reference x) const {
+  return std::addressof(x);
+}
+
+// Allocates raw storage for n objects of type T from the wrapped allocator.
+// No objects are constructed.
+template <typename T>
+T* StlAllocator<T>::allocate(std::size_t n) {
+  const Allocation block = allocator->allocate(request(n));
+  return static_cast<T*>(block.ptr);
+}
+
+// Returns the storage for n objects at p to the wrapped allocator. The
+// request is rebuilt from n, so n must match the count passed to allocate().
+template <typename T>
+void StlAllocator<T>::deallocate(T* p, std::size_t n) {
+  Allocation released;
+  released.request = request(n);
+  released.ptr = p;
+  allocator->free(released);
+}
+
+// Returns the largest value of size_type divided by the size of one element.
+template <typename T>
+typename StlAllocator<T>::size_type StlAllocator<T>::max_size() const {
+  const size_type largest = std::numeric_limits<size_type>::max();
+  return largest / sizeof(value_type);
+}
+
+// Copy-constructs a T from val in the storage at p using placement new.
+template <typename T>
+void StlAllocator<T>::construct(pointer p, const_reference val) {
+  new (p) T(val);
+}
+
+// Constructs a U in the storage at p, forwarding args to its constructor.
+// The global placement new is selected explicitly with `::new`.
+template <typename T>
+template <typename U, typename... Args>
+void StlAllocator<T>::construct(U* p, Args&&... args) {
+  ::new ((void*)p) U(std::forward<Args>(args)...);
+}
+
+// Runs the destructor of the T at p. The storage itself is not freed.
+template <typename T>
+void StlAllocator<T>::destroy(pointer p) {
+  p->~T();
+}
+
+// Runs the destructor of the U at p. The storage itself is not freed.
+template <typename T>
+template <typename U>
+void StlAllocator<T>::destroy(U* p) {
+  p->~U();
+}
+
+// Builds the request for n objects of type T: sizeof(T) * n bytes, aligned to
+// alignof(T), tagged Usage::Stl. The remaining fields keep their defaults.
+template <typename T>
+Allocation::Request StlAllocator<T>::request(size_t n) const {
+  Allocation::Request req;
+  req.usage = Allocation::Usage::Stl;
+  req.alignment = alignof(T);
+  req.size = sizeof(T) * n;
+  return req;
+}
+
+}  // namespace marl
+
+#endif  // marl_memory_h
--- a/3party/marl/include/marl/mutex.h
+++ b/3party/marl/include/marl/mutex.h
@ -0,0 +1,109 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Wrappers around std::mutex and std::unique_lock that provide clang's
+// Thread Safety Analysis annotations.
+// See: https://clang.llvm.org/docs/ThreadSafetyAnalysis.html
+
+#ifndef marl_mutex_h
+#define marl_mutex_h
+
+#include "export.h"
+#include "tsa.h"
+
+#include <condition_variable>
+#include <mutex>
+
+namespace marl {
+
+// mutex is a wrapper around std::mutex that offers Thread Safety Analysis
+// annotations.
+// mutex also holds methods for performing std::condition_variable::wait() calls
+// as these require a std::unique_lock<> which are unsupported by the TSA.
+class CAPABILITY("mutex") mutex {
+ public:
+  // lock() acquires the wrapped std::mutex.
+  MARL_NO_EXPORT inline void lock() ACQUIRE() { _.lock(); }
+
+  // unlock() releases the wrapped std::mutex.
+  MARL_NO_EXPORT inline void unlock() RELEASE() { _.unlock(); }
+
+  // try_lock() attempts to acquire the wrapped std::mutex without blocking
+  // and returns true on success.
+  MARL_NO_EXPORT inline bool try_lock() TRY_ACQUIRE(true) {
+    const bool acquired = _.try_lock();
+    return acquired;
+  }
+
+  // wait_locked calls cv.wait() on this already locked mutex.
+  // The mutex is still held when the call returns.
+  template <typename Predicate>
+  MARL_NO_EXPORT inline void wait_locked(std::condition_variable& cv,
+                                         Predicate&& p) REQUIRES(this) {
+    // Adopt the lock the caller already holds so it can be passed to the
+    // condition variable.
+    std::unique_lock<std::mutex> adopted(_, std::adopt_lock);
+    cv.wait(adopted, std::forward<Predicate>(p));
+    adopted.release();  // Keep lock held.
+  }
+
+  // wait_until_locked calls cv.wait_until() on this already locked mutex and
+  // returns its result. The mutex is still held when the call returns.
+  template <typename Predicate, typename Time>
+  MARL_NO_EXPORT inline bool wait_until_locked(std::condition_variable& cv,
+                                               Time&& time,
+                                               Predicate&& p) REQUIRES(this) {
+    std::unique_lock<std::mutex> adopted(_, std::adopt_lock);
+    const bool satisfied = cv.wait_until(adopted, std::forward<Time>(time),
+                                         std::forward<Predicate>(p));
+    adopted.release();  // Keep lock held.
+    return satisfied;
+  }
+
+ private:
+  // marl::lock constructs its std::unique_lock directly from the wrapped
+  // mutex.
+  friend class lock;
+  std::mutex _;
+};
+
+// lock is a RAII lock helper that offers Thread Safety Analysis annotations.
+// lock also holds methods for performing std::condition_variable::wait()
+// calls as these require a std::unique_lock<> which are unsupported by the TSA.
+class SCOPED_CAPABILITY lock {
+ public:
+  // Acquires m for the lifetime of this lock object.
+  inline lock(mutex& m) ACQUIRE(m) : _(m._) {}
+  inline ~lock() RELEASE() = default;
+
+  // wait calls cv.wait() on this lock.
+  template <typename Predicate>
+  inline void wait(std::condition_variable& cv, Predicate&& p) {
+    cv.wait(_, std::forward<Predicate>(p));
+  }
+
+  // wait_until calls cv.wait_until() on this lock and returns its result.
+  template <typename Predicate, typename Time>
+  inline bool wait_until(std::condition_variable& cv,
+                         Time&& time,
+                         Predicate&& p) {
+    return cv.wait_until(_, std::forward<Time>(time),
+                         std::forward<Predicate>(p));
+  }
+
+  // owns_lock returns whether the underlying std::unique_lock currently owns
+  // the mutex.
+  inline bool owns_lock() const { return _.owns_lock(); }
+
+  // lock_no_tsa locks the mutex outside of the visibility of the thread
+  // safety analysis. Use with caution.
+  inline void lock_no_tsa() { _.lock(); }
+
+  // unlock_no_tsa unlocks the mutex outside of the visibility of the thread
+  // safety analysis. Use with caution.
+  inline void unlock_no_tsa() { _.unlock(); }
+
+ private:
+  std::unique_lock<std::mutex> _;
+};
+
+}  // namespace marl
+
+#endif  // marl_mutex_h
--- a/3party/marl/include/marl/parallelize.h
+++ b/3party/marl/include/marl/parallelize.h
@ -0,0 +1,63 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef marl_parallelize_h
+#define marl_parallelize_h
+
+#include "scheduler.h"
+#include "waitgroup.h"
+
+namespace marl {
+
+namespace detail {
+
+// Base case of the recursion: no functions left to schedule.
+MARL_NO_EXPORT inline void parallelizeChain(WaitGroup&) {}
+
+// Schedules a task that calls f and then wg.done(), then recurses on the
+// remaining functions. The task holds its own copies of f and wg.
+template <typename F, typename... L>
+MARL_NO_EXPORT inline void parallelizeChain(WaitGroup& wg, F&& f, L&&... l) {
+  schedule([f, wg] {
+    f();
+    wg.done();
+  });
+  parallelizeChain(wg, std::forward<L>(l)...);
+}
+
+}  // namespace detail
+
+// parallelize() invokes all the function parameters, potentially concurrently,
+// and waits for them all to complete before returning.
+//
+// Each function must take no parameters.
+//
+// parallelize() does the following:
+//   (1) Schedules the function parameters in the parameter pack fn.
+//   (2) Calls f0 on the current thread.
+//   (3) Once f0 returns, waits for the scheduled functions in fn to all
+//   complete.
+// As the fn functions are scheduled before running f0, it is recommended to
+// pass the function that'll take the most time as the first argument. That way
+// you'll be more likely to avoid the cost of a fiber switch.
+template <typename F0, typename... FN>
+MARL_NO_EXPORT inline void parallelize(F0&& f0, FN&&... fn) {
+  // The wait group starts at the number of scheduled functions; each
+  // scheduled task calls done() once when it finishes.
+  WaitGroup wg(sizeof...(FN));
+  // Schedule all the functions in fn.
+  detail::parallelizeChain(wg, std::forward<FN>(fn)...);
+  // While we wait for fn to complete, run the first function on this thread.
+  f0();
+  wg.wait();
+}
+
+}  // namespace marl
+
+#endif  // marl_parallelize_h
--- a/3party/marl/include/marl/pool.h
+++ b/3party/marl/include/marl/pool.h
@ -0,0 +1,451 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef marl_pool_h
+#define marl_pool_h
+
+#include "conditionvariable.h"
+#include "memory.h"
+#include "mutex.h"
+
+#include <atomic>
+
+namespace marl {
+
+// PoolPolicy controls whether pool items are constructed and destructed each
+// time they are borrowed from and returned to a pool, or whether they persist
+// constructed for the lifetime of the pool.
+enum class PoolPolicy {
+  // Call the Pool items constructor on borrow(), and destruct the item
+  // when the item is returned.
+  Reconstruct,
+
+  // Construct and destruct all items once for the lifetime of the Pool.
+  // Items will keep their state between loans.
+  // BoundedPool's storage constructs every item in its constructor and
+  // destructs every item in its destructor under this policy.
+  Preserve,
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// Pool<T>
+////////////////////////////////////////////////////////////////////////////////
+
+// Pool is the abstract base class for BoundedPool<> and UnboundedPool<>.
+template <typename T>
+class Pool {
+ protected:
+  struct Item;
+  class Storage;
+
+ public:
+  // A Loan is returned by the pool's borrow() function.
+  // Loans track the number of references to the loaned item, and return the
+  // item to the pool when the final Loan reference is dropped.
+  class Loan {
+   public:
+    MARL_NO_EXPORT inline Loan() = default;
+    MARL_NO_EXPORT inline Loan(Item*, const std::shared_ptr<Storage>&);
+    MARL_NO_EXPORT inline Loan(const Loan&);
+    MARL_NO_EXPORT inline Loan(Loan&&);
+    MARL_NO_EXPORT inline ~Loan();
+    MARL_NO_EXPORT inline Loan& operator=(const Loan&);
+    MARL_NO_EXPORT inline Loan& operator=(Loan&&);
+    // operator*() and operator->() dereference the item without a null
+    // check; get() returns nullptr for an empty Loan.
+    MARL_NO_EXPORT inline T& operator*();
+    MARL_NO_EXPORT inline T* operator->() const;
+    MARL_NO_EXPORT inline T* get() const;
+    // reset() drops this Loan's reference, returning the item to the storage
+    // if it was the last one, and leaves the Loan empty.
+    MARL_NO_EXPORT inline void reset();
+
+   private:
+    Item* item = nullptr;              // The loaned item, or nullptr if empty.
+    std::shared_ptr<Storage> storage;  // The storage the item is returned to.
+  };
+
+ protected:
+  Pool() = default;
+
+  // The shared storage between the pool and all loans.
+  class Storage {
+   public:
+    virtual ~Storage() = default;
+    // return_() is called by Loan::reset() when the last reference to an item
+    // is dropped.
+    virtual void return_(Item*) = 0;
+  };
+
+  // The backing data of a single item in the pool.
+  struct Item {
+    // get() returns a pointer to the item's data.
+    MARL_NO_EXPORT inline T* get();
+
+    // construct() calls the constructor on the item's data.
+    MARL_NO_EXPORT inline void construct();
+
+    // destruct() calls the destructor on the item's data.
+    MARL_NO_EXPORT inline void destruct();
+
+    // Raw storage sized and aligned for a T.
+    using Data = typename aligned_storage<sizeof(T), alignof(T)>::type;
+    Data data;
+    // Number of Loans currently referencing this item.
+    std::atomic<int> refcount = {0};
+    Item* next = nullptr;  // pointer to the next free item in the pool.
+  };
+};
+
+// Loan<T> is a namespace-scope alias to Pool<T>::Loan, so that a loan can be
+// named without spelling out the pool type.
+template <typename T>
+using Loan = typename Pool<T>::Loan;
+
+////////////////////////////////////////////////////////////////////////////////
+// Pool<T>::Item
+////////////////////////////////////////////////////////////////////////////////
+// Returns the item's raw storage reinterpreted as a T*. The pointer is only
+// usable as a T while a T is constructed in the storage.
+template <typename T>
+T* Pool<T>::Item::get() {
+  return reinterpret_cast<T*>(&data);
+}
+
+// Default-initializes a T in the item's storage using placement new.
+template <typename T>
+void Pool<T>::Item::construct() {
+  new (&data) T;
+}
+
+// Runs the destructor of the T held in the item's storage.
+template <typename T>
+void Pool<T>::Item::destruct() {
+  get()->~T();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Pool<T>::Loan
+////////////////////////////////////////////////////////////////////////////////
+// Constructs a Loan that references item and increments its reference count.
+// The parameters shadow the members of the same name; inside the body `item`
+// is the parameter, which is dereferenced without a null check.
+template <typename T>
+Pool<T>::Loan::Loan(Item* item, const std::shared_ptr<Storage>& storage)
+    : item(item), storage(storage) {
+  item->refcount++;
+}
+
+// Copy constructor: shares other's item and storage, adding one reference to
+// the item when other is not empty.
+template <typename T>
+Pool<T>::Loan::Loan(const Loan& other)
+    : item(other.item), storage(other.storage) {
+  if (item == nullptr) {
+    return;
+  }
+  item->refcount++;
+}
+
+// Move constructor: takes over other's item and storage without touching the
+// item's reference count, and leaves other empty.
+//
+// The storage pointer is moved rather than copied, which avoids an atomic
+// increment/decrement pair on the shared_ptr control block.
+template <typename T>
+Pool<T>::Loan::Loan(Loan&& other)
+    : item(other.item), storage(std::move(other.storage)) {
+  other.item = nullptr;
+  other.storage = nullptr;
+}
+
+// Destructor: drops this Loan's reference via reset().
+template <typename T>
+Pool<T>::Loan::~Loan() {
+  reset();
+}
+
+// reset() drops this Loan's reference to its item. When the reference count
+// reaches zero the item is handed back to the storage via return_().
+// Afterwards the Loan is empty. Calling reset() on an empty Loan does nothing.
+template <typename T>
+void Pool<T>::Loan::reset() {
+  if (item == nullptr) {
+    return;
+  }
+  const auto remaining = --item->refcount;
+  MARL_ASSERT(remaining >= 0, "reset() called on zero-ref pool item");
+  if (remaining == 0) {
+    storage->return_(item);
+  }
+  item = nullptr;
+  storage = nullptr;
+}
+
+// Copy assignment: releases the currently held item (if any) and shares
+// rhs's item, adding one reference to it.
+//
+// Self-assignment is a no-op. Without the guard, reset() would drop the only
+// reference, return the item to the pool and leave this Loan empty.
+// The new reference is taken before the old one is released, so the incoming
+// item is kept alive throughout.
+template <typename T>
+typename Pool<T>::Loan& Pool<T>::Loan::operator=(const Loan& rhs) {
+  if (this == &rhs) {
+    return *this;
+  }
+  Loan incoming(rhs);  // Acquires the new reference (if rhs is not empty).
+  reset();             // Releases the old reference; this Loan is now empty.
+  // Hand the acquired reference over; incoming is left empty, so its
+  // destructor does not touch the reference count.
+  std::swap(item, incoming.item);
+  std::swap(storage, incoming.storage);
+  return *this;
+}
+
+// Move assignment: releases the currently held item (if any), then takes
+// over rhs's item and storage without changing the item's reference count.
+template <typename T>
+typename Pool<T>::Loan& Pool<T>::Loan::operator=(Loan&& rhs) {
+  reset();
+  // reset() always leaves item as nullptr, so exchanging the pointers
+  // amounts to taking rhs's item and leaving rhs with nullptr.
+  item = rhs.item;
+  rhs.item = nullptr;
+  storage.swap(rhs.storage);
+  return *this;
+}
+
+// Returns a reference to the loaned object. The item pointer is dereferenced
+// without a null check, so the Loan must not be empty.
+template <typename T>
+T& Pool<T>::Loan::operator*() {
+  return *item->get();
+}
+
+// Returns a pointer to the loaned object. The item pointer is dereferenced
+// without a null check, so the Loan must not be empty.
+template <typename T>
+T* Pool<T>::Loan::operator->() const {
+  return item->get();
+}
+
+// Returns a pointer to the loaned object, or nullptr if the Loan is empty.
+template <typename T>
+T* Pool<T>::Loan::get() const {
+  if (item == nullptr) {
+    return nullptr;
+  }
+  return item->get();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// BoundedPool<T, N, POLICY>
+////////////////////////////////////////////////////////////////////////////////
+
+// BoundedPool<T, N, POLICY> is a pool of items of type T, with a maximum
+// capacity of N items.
+// BoundedPool<> is initially populated with N default-constructed items.
+// POLICY controls whether pool items are constructed and destructed each
+// time they are borrowed from and returned to the pool.
+template <typename T, int N, PoolPolicy POLICY = PoolPolicy::Reconstruct>
+class BoundedPool : public Pool<T> {
+ public:
+  using Item = typename Pool<T>::Item;
+  using Loan = typename Pool<T>::Loan;
+
+  // Creates the pool's shared storage using allocator.
+  MARL_NO_EXPORT inline BoundedPool(Allocator* allocator = Allocator::Default);
+
+  // borrow() borrows a single item from the pool, blocking until an item is
+  // returned if the pool is empty.
+  MARL_NO_EXPORT inline Loan borrow() const;
+
+  // borrow() borrows count items from the pool, blocking until there are at
+  // least count items in the pool. The function f() is called with each
+  // borrowed item.
+  // F must be a function with the signature: void(T&&)
+  // NOTE(review): the single-item borrow() passes a callable taking Loan&&,
+  // which suggests f receives a Loan rather than a T - confirm against the
+  // implementation.
+  template <typename F>
+  MARL_NO_EXPORT inline void borrow(size_t count, const F& f) const;
+
+  // tryBorrow() attempts to borrow a single item from the pool without
+  // blocking.
+  // The boolean of the returned pair is true on success, or false if the pool
+  // is empty.
+  MARL_NO_EXPORT inline std::pair<Loan, bool> tryBorrow() const;
+
+ private:
+  class Storage : public Pool<T>::Storage {
+   public:
+    MARL_NO_EXPORT inline Storage(Allocator* allocator);
+    MARL_NO_EXPORT inline ~Storage();
+    MARL_NO_EXPORT inline void return_(Item*) override;
+    // We cannot copy this as the Item pointers would be shared and
+    // deleted at a wrong point. We cannot move this because we return
+    // pointers into items[N].
+    MARL_NO_EXPORT inline Storage(const Storage&) = delete;
+    MARL_NO_EXPORT inline Storage& operator=(const Storage&) = delete;
+
+    Item items[N];               // Backing storage for all N items.
+    marl::mutex mutex;
+    ConditionVariable returned;
+    Item* free = nullptr;        // Head of the linked list of free items.
+  };
+  std::shared_ptr<Storage> storage;
+};
+
+// Builds the free list from the fixed items array. Items are pushed onto the
+// head of the list in array order, so the last array element ends up at the
+// head. With PoolPolicy::Preserve each item is constructed once, here.
+template <typename T, int N, PoolPolicy POLICY>
+BoundedPool<T, N, POLICY>::Storage::Storage(Allocator* allocator)
+    : returned(allocator) {
+  for (Item& slot : items) {
+    if (POLICY == PoolPolicy::Preserve) {
+      slot.construct();
+    }
+    slot.next = this->free;
+    this->free = &slot;
+  }
+}
+
+// Destructs every item when the policy is PoolPolicy::Preserve. For any other
+// policy nothing is done here.
+template <typename T, int N, PoolPolicy POLICY>
+BoundedPool<T, N, POLICY>::Storage::~Storage() {
+  if (POLICY != PoolPolicy::Preserve) {
+    return;
+  }
+  for (Item& slot : items) {
+    slot.destruct();
+  }
+}
+
+// Constructs the pool, creating its shared Storage through the given
+// allocator. The same allocator is also passed on to the Storage itself.
+template <typename T, int N, PoolPolicy POLICY>
+BoundedPool<T, N, POLICY>::BoundedPool(
+    Allocator* allocator /* = Allocator::Default */)
+    : storage(allocator->make_shared<Storage>(allocator)) {}
+
+// Borrows a single item by delegating to borrow(1, f) and capturing the loan
+// that is handed to the callback.
+template <typename T, int N, PoolPolicy POLICY>
+typename BoundedPool<T, N, POLICY>::Loan BoundedPool<T, N, POLICY>::borrow()
+    const {
+  Loan result;
+  auto capture = [&result](Loan&& borrowed) { result = std::move(borrowed); };
+  borrow(1, capture);
+  return result;
+}
+
+// Borrows n items one at a time, waiting on the `returned` condition variable
+// whenever the free list is empty, and passes each new Loan to f.
+// The storage mutex is locked for the whole call; the wait() is handed the
+// lock.
+template <typename T, int N, PoolPolicy POLICY>
+template <typename F>
+void BoundedPool<T, N, POLICY>::borrow(size_t n, const F& f) const {
+  marl::lock lock(storage->mutex);
+  for (size_t remaining = n; remaining > 0; remaining--) {
+    storage->returned.wait(lock, [&] { return storage->free != nullptr; });
+    // Pop the head of the free list.
+    Item* const item = storage->free;
+    storage->free = item->next;
+    if (POLICY == PoolPolicy::Reconstruct) {
+      item->construct();
+    }
+    Loan loan(item, storage);
+    f(std::move(loan));
+  }
+}
+
+// tryBorrow() pops one item off the free list without waiting.
+// Returns {loan, true} on success, or {empty Loan, false} when the free list
+// is empty. The storage mutex is only held while the free list is updated;
+// with PoolPolicy::Reconstruct the item is constructed after it is released.
+template <typename T, int N, PoolPolicy POLICY>
+std::pair<typename BoundedPool<T, N, POLICY>::Loan, bool>
+BoundedPool<T, N, POLICY>::tryBorrow() const {
+  Item* item = nullptr;
+  {
+    marl::lock lock(storage->mutex);
+    if (storage->free == nullptr) {
+      return std::make_pair(Loan(), false);
+    }
+    item = storage->free;
+    storage->free = storage->free->next;
+    // NOTE(review): the blocking borrow() overload never assigns Item::pool,
+    // and no `pool` member is visible from this part of the file. Confirm that
+    // Item declares it; otherwise this function cannot be instantiated.
+    item->pool = this;
+  }
+  if (POLICY == PoolPolicy::Reconstruct) {
+    item->construct();
+  }
+  return std::make_pair(Loan(item, storage), true);
+}
+
+// return_() puts item back on the free list and signals one waiter on the
+// `returned` condition variable.
+// With PoolPolicy::Reconstruct the item is destructed first, before the mutex
+// is taken.
+template <typename T, int N, PoolPolicy POLICY>
+void BoundedPool<T, N, POLICY>::Storage::return_(Item* item) {
+  if (POLICY == PoolPolicy::Reconstruct) {
+    item->destruct();
+  }
+  {
+    // Push the item onto the head of the free list.
+    marl::lock guard(mutex);
+    item->next = free;
+    free = item;
+  }
+  // The notification is sent after the lock has been released.
+  returned.notify_one();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// UnboundedPool
+////////////////////////////////////////////////////////////////////////////////
+
+// UnboundedPool<T, POLICY> is a pool of items of type T.
+// UnboundedPool<> will automatically allocate more items if the pool becomes
+// empty.
+// POLICY controls whether pool items are constructed and destructed each
+// time they are borrowed from and returned to the pool.
+template <typename T, PoolPolicy POLICY = PoolPolicy::Reconstruct>
+class UnboundedPool : public Pool<T> {
+ public:
+  using Item = typename Pool<T>::Item;
+  using Loan = typename Pool<T>::Loan;
+
+  MARL_NO_EXPORT inline UnboundedPool(
+      Allocator* allocator = Allocator::Default);
+
+  // borrow() borrows a single item from the pool, automatically allocating
+  // more items if the pool is empty.
+  // This function does not block.
+  MARL_NO_EXPORT inline Loan borrow() const;
+
+  // borrow() borrows count items from the pool, calling the function f() with
+  // each borrowed item.
+  // F must be callable with the signature: void(Loan&&)
+  // This function does not block.
+  template <typename F>
+  MARL_NO_EXPORT inline void borrow(size_t n, const F& f) const;
+
+ private:
+  class Storage : public Pool<T>::Storage {
+   public:
+    MARL_NO_EXPORT inline Storage(Allocator* allocator);
+    MARL_NO_EXPORT inline ~Storage();
+    MARL_NO_EXPORT inline void return_(Item*) override;
+    // We cannot copy this as the Item pointers would be shared and
+    // deleted at a wrong point. We could move this but would have to take
+    // extra care no Item pointers are left in the moved-out object.
+    MARL_NO_EXPORT inline Storage(const Storage&) = delete;
+    MARL_NO_EXPORT inline Storage& operator=(const Storage&) = delete;
+
+    Allocator* allocator;
+    marl::mutex mutex;
+    containers::vector<Item*, 4> items;
+    Item* free = nullptr;
+  };
+
+  Allocator* allocator;
+  std::shared_ptr<Storage> storage;
+};
+
+// Stores the allocator (used by ~Storage() to destroy items) and passes it to
+// the items vector. The pool starts with no items.
+template <typename T, PoolPolicy POLICY>
+UnboundedPool<T, POLICY>::Storage::Storage(Allocator* allocator)
+    : allocator(allocator), items(allocator) {}
+
+// Destroys every item that was ever allocated for the pool. With
+// PoolPolicy::Preserve each item is destructed before its memory is handed
+// back to the allocator.
+template <typename T, PoolPolicy POLICY>
+UnboundedPool<T, POLICY>::Storage::~Storage() {
+  for (Item* owned : items) {
+    if (POLICY == PoolPolicy::Preserve) {
+      owned->destruct();
+    }
+    allocator->destroy(owned);
+  }
+}
+
+// Constructs the pool. The allocator is kept for creating items in borrow()
+// and is also used to create the shared Storage.
+template <typename T, PoolPolicy POLICY>
+UnboundedPool<T, POLICY>::UnboundedPool(
+    Allocator* allocator /* = Allocator::Default */)
+    : allocator(allocator),
+      storage(allocator->make_shared<Storage>(allocator)) {}
+
+// Borrows a single item by delegating to borrow(1, f) and capturing the loan
+// that is handed to the callback.
+template <typename T, PoolPolicy POLICY>
+Loan<T> UnboundedPool<T, POLICY>::borrow() const {
+  Loan result;
+  auto capture = [&result](Loan&& borrowed) { result = std::move(borrowed); };
+  borrow(1, capture);
+  return result;
+}
+
+// Borrows n items, passing each new Loan to f. Whenever the free list is
+// empty, a batch of new items is allocated first; the batch size is the
+// number of items allocated so far, with a minimum of 32.
+// The storage mutex is held for the whole call, including the calls to f.
+template <typename T, PoolPolicy POLICY>
+template <typename F>
+inline void UnboundedPool<T, POLICY>::borrow(size_t n, const F& f) const {
+  marl::lock lock(storage->mutex);
+  for (size_t served = 0; served < n; served++) {
+    if (storage->free == nullptr) {
+      // Grow the pool. The batch size is fixed before any item is added.
+      const size_t batch = std::max<size_t>(storage->items.size(), 32);
+      for (size_t k = 0; k < batch; k++) {
+        Item* const fresh = allocator->create<Item>();
+        if (POLICY == PoolPolicy::Preserve) {
+          fresh->construct();
+        }
+        storage->items.push_back(fresh);
+        fresh->next = storage->free;
+        storage->free = fresh;
+      }
+    }
+
+    // Pop the head of the free list.
+    Item* const item = storage->free;
+    storage->free = item->next;
+    if (POLICY == PoolPolicy::Reconstruct) {
+      item->construct();
+    }
+    Loan loan(item, storage);
+    f(std::move(loan));
+  }
+}
+
+// return_() puts item back on the free list.
+// With PoolPolicy::Reconstruct the item is destructed first, before the mutex
+// is taken.
+template <typename T, PoolPolicy POLICY>
+void UnboundedPool<T, POLICY>::Storage::return_(Item* item) {
+  if (POLICY == PoolPolicy::Reconstruct) {
+    item->destruct();
+  }
+  // Push the item onto the head of the free list.
+  marl::lock guard(mutex);
+  item->next = free;
+  free = item;
+}
+
+}  // namespace marl
+
+#endif  // marl_pool_h
--- a/3party/marl/include/marl/sanitizers.h
+++ b/3party/marl/include/marl/sanitizers.h
@ -0,0 +1,98 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef marl_sanitizers_h
+#define marl_sanitizers_h
+
+// Define MARL_ADDRESS_SANITIZER_ENABLED to 1 if the project was built with the
+// address sanitizer enabled (-fsanitize=address).
+#if defined(__SANITIZE_ADDRESS__)
+#define MARL_ADDRESS_SANITIZER_ENABLED 1
+#else  // defined(__SANITIZE_ADDRESS__)
+#if defined(__clang__)
+#if __has_feature(address_sanitizer)
+#define MARL_ADDRESS_SANITIZER_ENABLED 1
+#endif  // __has_feature(address_sanitizer)
+#endif  // defined(__clang__)
+#endif  // defined(__SANITIZE_ADDRESS__)
+
+// MARL_ADDRESS_SANITIZER_ONLY(X) resolves to X if
+// MARL_ADDRESS_SANITIZER_ENABLED is defined to a non-zero value, otherwise
+// MARL_ADDRESS_SANITIZER_ONLY() is stripped by the preprocessor.
+#if MARL_ADDRESS_SANITIZER_ENABLED
+#define MARL_ADDRESS_SANITIZER_ONLY(x) x
+#else
+#define MARL_ADDRESS_SANITIZER_ONLY(x)
+#endif  // MARL_ADDRESS_SANITIZER_ENABLED
+
+// Define MARL_MEMORY_SANITIZER_ENABLED to 1 if the project was built with the
+// memory sanitizer enabled (-fsanitize=memory).
+#if defined(__SANITIZE_MEMORY__)
+#define MARL_MEMORY_SANITIZER_ENABLED 1
+#else  // defined(__SANITIZE_MEMORY__)
+#if defined(__clang__)
+#if __has_feature(memory_sanitizer)
+#define MARL_MEMORY_SANITIZER_ENABLED 1
+#endif  // __has_feature(memory_sanitizer)
+#endif  // defined(__clang__)
+#endif  // defined(__SANITIZE_MEMORY__)
+
+// MARL_MEMORY_SANITIZER_ONLY(X) resolves to X if MARL_MEMORY_SANITIZER_ENABLED
+// is defined to a non-zero value, otherwise MARL_MEMORY_SANITIZER_ONLY() is
+// stripped by the preprocessor.
+#if MARL_MEMORY_SANITIZER_ENABLED
+#define MARL_MEMORY_SANITIZER_ONLY(x) x
+#else
+#define MARL_MEMORY_SANITIZER_ONLY(x)
+#endif  // MARL_MEMORY_SANITIZER_ENABLED
+
+// Define MARL_THREAD_SANITIZER_ENABLED to 1 if the project was built with the
+// thread sanitizer enabled (-fsanitize=thread).
+#if defined(__SANITIZE_THREAD__)
+#define MARL_THREAD_SANITIZER_ENABLED 1
+#else  // defined(__SANITIZE_THREAD__)
+#if defined(__clang__)
+#if __has_feature(thread_sanitizer)
+#define MARL_THREAD_SANITIZER_ENABLED 1
+#endif  // __has_feature(thread_sanitizer)
+#endif  // defined(__clang__)
+#endif  // defined(__SANITIZE_THREAD__)
+
+// MARL_THREAD_SANITIZER_ONLY(X) resolves to X if MARL_THREAD_SANITIZER_ENABLED
+// is defined to a non-zero value, otherwise MARL_THREAD_SANITIZER_ONLY() is
+// stripped by the preprocessor.
+#if MARL_THREAD_SANITIZER_ENABLED
+#define MARL_THREAD_SANITIZER_ONLY(x) x
+#else
+#define MARL_THREAD_SANITIZER_ONLY(x)
+#endif  // MARL_THREAD_SANITIZER_ENABLED
+
+// Define MARL_UNDEFINED_SANITIZER_ENABLED to 1 if the project was built with
+// the undefined behavior sanitizer enabled (-fsanitize=undefined).
+// Unlike the sanitizers above, detection here relies solely on clang's
+// __has_feature(); with any other compiler the macro is left undefined.
+#if defined(__clang__)
+#if __has_feature(undefined_behavior_sanitizer)
+#define MARL_UNDEFINED_SANITIZER_ENABLED 1
+#endif  // __has_feature(undefined_behavior_sanitizer)
+#endif  // defined(__clang__)
+
+// MARL_UNDEFINED_SANITIZER_ONLY(X) resolves to X if
+// MARL_UNDEFINED_SANITIZER_ENABLED is defined to a non-zero value, otherwise
+// MARL_UNDEFINED_SANITIZER_ONLY() is stripped by the preprocessor.
+#if MARL_UNDEFINED_SANITIZER_ENABLED
+#define MARL_UNDEFINED_SANITIZER_ONLY(x) x
+#else
+#define MARL_UNDEFINED_SANITIZER_ONLY(x)
+#endif  // MARL_UNDEFINED_SANITIZER_ENABLED
+
+#endif  // marl_sanitizers_h
--- a/3party/marl/include/marl/scheduler.h
+++ b/3party/marl/include/marl/scheduler.h
@ -0,0 +1,615 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef marl_scheduler_h
+#define marl_scheduler_h
+
+#include "containers.h"
+#include "debug.h"
+#include "deprecated.h"
+#include "export.h"
+#include "memory.h"
+#include "mutex.h"
+#include "sanitizers.h"
+#include "task.h"
+#include "thread.h"
+#include "thread_local.h"
+
+#include <array>
+#include <atomic>
+#include <chrono>
+#include <condition_variable>
+#include <functional>
+#include <thread>
+
+namespace marl {
+
+class OSFiber;
+
+// Scheduler asynchronously processes Tasks.
+// A scheduler can be bound to one or more threads using the bind() method.
+// Once bound to a thread, that thread can call marl::schedule() to enqueue
+// work tasks to be executed asynchronously.
+// Schedulers are initially constructed in single-threaded mode.
+// Use Config::setWorkerThreadCount() to spawn dedicated worker threads.
+class Scheduler {
+  class Worker;
+
+ public:
+  using TimePoint = std::chrono::system_clock::time_point;
+  using Predicate = std::function<bool()>;
+  using ThreadInitializer = std::function<void(int workerId)>;
+
+  // Config holds scheduler configuration settings that can be passed to the
+  // Scheduler constructor.
+  struct Config {
+    static constexpr size_t DefaultFiberStackSize = 1024 * 1024;
+
+    // Per-worker-thread settings.
+    struct WorkerThread {
+      // Total number of dedicated worker threads to spawn for the scheduler.
+      int count = 0;
+
+      // Initializer function to call after thread creation and before any work
+      // is run by the thread.
+      ThreadInitializer initializer;
+
+      // Thread affinity policy to use for worker threads.
+      std::shared_ptr<Thread::Affinity::Policy> affinityPolicy;
+    };
+
+    WorkerThread workerThread;
+
+    // Memory allocator to use for the scheduler and internal allocations.
+    Allocator* allocator = Allocator::Default;
+
+    // Size of each fiber stack. This may be rounded up to the nearest
+    // allocation granularity for the given platform.
+    size_t fiberStackSize = DefaultFiberStackSize;
+
+    // allCores() returns a Config with a worker thread for each of the logical
+    // cpus available to the process.
+    MARL_EXPORT
+    static Config allCores();
+
+    // Fluent setters that return this Config so set calls can be chained.
+    MARL_NO_EXPORT inline Config& setAllocator(Allocator*);
+    MARL_NO_EXPORT inline Config& setFiberStackSize(size_t);
+    MARL_NO_EXPORT inline Config& setWorkerThreadCount(int);
+    MARL_NO_EXPORT inline Config& setWorkerThreadInitializer(
+        const ThreadInitializer&);
+    MARL_NO_EXPORT inline Config& setWorkerThreadAffinityPolicy(
+        const std::shared_ptr<Thread::Affinity::Policy>&);
+  };
+
+  // Constructor.
+  MARL_EXPORT
+  Scheduler(const Config&);
+
+  // Destructor.
+  // Blocks until the scheduler is unbound from all threads before returning.
+  MARL_EXPORT
+  ~Scheduler();
+
+  // get() returns the scheduler bound to the current thread.
+  MARL_EXPORT
+  static Scheduler* get();
+
+  // bind() binds this scheduler to the current thread.
+  // There must be no existing scheduler bound to the thread prior to calling.
+  MARL_EXPORT
+  void bind();
+
+  // unbind() unbinds the scheduler currently bound to the current thread.
+  // There must be an existing scheduler bound to the thread prior to calling.
+  // unbind() flushes any enqueued tasks on the single-threaded worker before
+  // returning.
+  MARL_EXPORT
+  static void unbind();
+
+  // enqueue() queues the task for asynchronous execution.
+  MARL_EXPORT
+  void enqueue(Task&& task);
+
+  // config() returns the Config that was used to build the scheduler.
+  MARL_EXPORT
+  const Config& config() const;
+
+  // Fibers expose methods to perform cooperative multitasking and are
+  // automatically created by the Scheduler.
+  //
+  // The currently executing Fiber can be obtained by calling Fiber::current().
+  //
+  // When execution becomes blocked, wait() can be called to suspend execution
+  // of the fiber and start executing other pending work. Once the block has
+  // been lifted, notify() can be called to reschedule the Fiber on the same
+  // thread that previously executed it.
+  class Fiber {
+   public:
+    // current() returns the currently executing fiber, or nullptr if called
+    // without a bound scheduler.
+    MARL_EXPORT
+    static Fiber* current();
+
+    // wait() suspends execution of this Fiber until the Fiber is woken up with
+    // a call to notify() and the predicate pred returns true.
+    // If the predicate pred does not return true when notify() is called, then
+    // the Fiber is automatically re-suspended, and will need to be woken with
+    // another call to notify().
+    // While the Fiber is suspended, the scheduler thread may continue executing
+    // other tasks.
+    // lock must be locked before calling, and is unlocked by wait() just before
+    // the Fiber is suspended, and re-locked before the fiber is resumed. lock
+    // will be locked before wait() returns.
+    // pred will always be called with the lock held.
+    // wait() must only be called on the currently executing fiber.
+    MARL_EXPORT
+    void wait(marl::lock& lock, const Predicate& pred);
+
+    // wait() suspends execution of this Fiber until the Fiber is woken up with
+    // a call to notify() and the predicate pred returns true, or sometime after
+    // the timeout is reached.
+    // If the predicate pred does not return true when notify() is called, then
+    // the Fiber is automatically re-suspended, and will need to be woken with
+    // another call to notify() or will be woken sometime after the timeout is
+    // reached.
+    // While the Fiber is suspended, the scheduler thread may continue executing
+    // other tasks.
+    // lock must be locked before calling, and is unlocked by wait() just before
+    // the Fiber is suspended, and re-locked before the fiber is resumed. lock
+    // will be locked before wait() returns.
+    // pred will always be called with the lock held.
+    // wait() must only be called on the currently executing fiber.
+    template <typename Clock, typename Duration>
+    MARL_NO_EXPORT inline bool wait(
+        marl::lock& lock,
+        const std::chrono::time_point<Clock, Duration>& timeout,
+        const Predicate& pred);
+
+    // wait() suspends execution of this Fiber until the Fiber is woken up with
+    // a call to notify().
+    // While the Fiber is suspended, the scheduler thread may continue executing
+    // other tasks.
+    // wait() must only be called on the currently executing fiber.
+    //
+    // Warning: Unlike wait() overloads that take a lock and predicate, this
+    // form of wait() offers no safety for notify() signals that occur before
+    // the fiber is suspended, when signalling between different threads. In
+    // this scenario you may deadlock. For this reason, it is only ever
+    // recommended to use this overload if you can guarantee that the calls to
+    // wait() and notify() are made by the same thread.
+    //
+    // Use with extreme caution.
+    MARL_NO_EXPORT inline void wait();
+
+    // wait() suspends execution of this Fiber until the Fiber is woken up with
+    // a call to notify(), or sometime after the timeout is reached.
+    // While the Fiber is suspended, the scheduler thread may continue executing
+    // other tasks.
+    // wait() must only be called on the currently executing fiber.
+    //
+    // Warning: Unlike wait() overloads that take a lock and predicate, this
+    // form of wait() offers no safety for notify() signals that occur before
+    // the fiber is suspended, when signalling between different threads. For
+    // this reason, it is only ever recommended to use this overload if you can
+    // guarantee that the calls to wait() and notify() are made by the same
+    // thread.
+    //
+    // Use with extreme caution.
+    template <typename Clock, typename Duration>
+    MARL_NO_EXPORT inline bool wait(
+        const std::chrono::time_point<Clock, Duration>& timeout);
+
+    // notify() reschedules the suspended Fiber for execution.
+    // notify() is usually only called when the predicate for one or more wait()
+    // calls will likely return true.
+    MARL_EXPORT
+    void notify();
+
+    // id is the thread-unique identifier of the Fiber.
+    uint32_t const id;
+
+   private:
+    friend class Allocator;
+    friend class Scheduler;
+
+    enum class State {
+      // Idle: the Fiber is currently unused, and sits in Worker::idleFibers,
+      // ready to be recycled.
+      Idle,
+
+      // Yielded: the Fiber is currently blocked on a wait() call with no
+      // timeout.
+      Yielded,
+
+      // Waiting: the Fiber is currently blocked on a wait() call with a
+      // timeout. The fiber is sitting in the Worker::Work::waiting queue.
+      Waiting,
+
+      // Queued: the Fiber is currently queued for execution in the
+      // Worker::Work::fibers queue.
+      Queued,
+
+      // Running: the Fiber is currently executing.
+      Running,
+    };
+
+    Fiber(Allocator::unique_ptr<OSFiber>&&, uint32_t id);
+
+    // switchTo() switches execution to the given fiber.
+    // switchTo() must only be called on the currently executing fiber.
+    void switchTo(Fiber*);
+
+    // create() constructs and returns a new fiber with the given identifier,
+    // stack size and func that will be executed when switched to.
+    static Allocator::unique_ptr<Fiber> create(
+        Allocator* allocator,
+        uint32_t id,
+        size_t stackSize,
+        const std::function<void()>& func);
+
+    // createFromCurrentThread() constructs and returns a new fiber with the
+    // given identifier for the current thread.
+    static Allocator::unique_ptr<Fiber> createFromCurrentThread(
+        Allocator* allocator,
+        uint32_t id);
+
+    // toString() returns a string representation of the given State.
+    // Used for debugging.
+    static const char* toString(State state);
+
+    Allocator::unique_ptr<OSFiber> const impl;
+    Worker* const worker;
+    State state = State::Running;  // Guarded by Worker's work.mutex.
+  };
+
+ private:
+  Scheduler(const Scheduler&) = delete;
+  Scheduler(Scheduler&&) = delete;
+  Scheduler& operator=(const Scheduler&) = delete;
+  Scheduler& operator=(Scheduler&&) = delete;
+
+  // Maximum number of worker threads.
+  static constexpr size_t MaxWorkerThreads = 256;
+
+  // WaitingFibers holds all the fibers waiting on a timeout.
+  struct WaitingFibers {
+    inline WaitingFibers(Allocator*);
+
+    // operator bool() returns true iff there are any wait fibers.
+    inline operator bool() const;
+
+    // take() returns the next fiber that has exceeded its timeout, or nullptr
+    // if there are no fibers that have yet exceeded their timeouts.
+    inline Fiber* take(const TimePoint& timeout);
+
+    // next() returns the timepoint of the next fiber to timeout.
+    // next() can only be called if operator bool() returns true.
+    inline TimePoint next() const;
+
+    // add() adds another fiber and timeout to the list of waiting fibers.
+    inline void add(const TimePoint& timeout, Fiber* fiber);
+
+    // erase() removes the fiber from the waiting list.
+    inline void erase(Fiber* fiber);
+
+    // contains() returns true if fiber is waiting.
+    inline bool contains(Fiber* fiber) const;
+
+   private:
+    struct Timeout {
+      TimePoint timepoint;
+      Fiber* fiber;
+      inline bool operator<(const Timeout&) const;
+    };
+    containers::set<Timeout, std::less<Timeout>> timeouts;
+    containers::unordered_map<Fiber*, TimePoint> fibers;
+  };
+
+  // TODO: Implement a queue that recycles elements to reduce number of
+  // heap allocations.
+  using TaskQueue = containers::deque<Task>;
+  using FiberQueue = containers::deque<Fiber*>;
+  using FiberSet = containers::unordered_set<Fiber*>;
+
+  // Workers execute Tasks on a single thread.
+  // Once a task is started, it may yield to other tasks on the same Worker.
+  // Tasks are always resumed by the same Worker.
+  class Worker {
+   public:
+    enum class Mode {
+      // Worker will spawn a background thread to process tasks.
+      MultiThreaded,
+
+      // Worker will execute tasks whenever it yields.
+      SingleThreaded,
+    };
+
+    Worker(Scheduler* scheduler, Mode mode, uint32_t id);
+
+    // start() begins execution of the worker.
+    void start() EXCLUDES(work.mutex);
+
+    // stop() ceases execution of the worker, blocking until all pending
+    // tasks have fully finished.
+    void stop() EXCLUDES(work.mutex);
+
+    // wait() suspends execution of the current task until the predicate pred
+    // returns true or the optional timeout is reached.
+    // See Fiber::wait() for more information.
+    MARL_EXPORT
+    bool wait(marl::lock& lock, const TimePoint* timeout, const Predicate& pred)
+        EXCLUDES(work.mutex);
+
+    // wait() suspends execution of the current task until the fiber is
+    // notified, or the optional timeout is reached.
+    // See Fiber::wait() for more information.
+    MARL_EXPORT
+    bool wait(const TimePoint* timeout) EXCLUDES(work.mutex);
+
+    // suspend() suspends the currently executing Fiber until the fiber is
+    // woken with a call to enqueue(Fiber*), or automatically sometime after the
+    // optional timeout.
+    void suspend(const TimePoint* timeout) REQUIRES(work.mutex);
+
+    // enqueue(Fiber*) enqueues resuming of a suspended fiber.
+    void enqueue(Fiber* fiber) EXCLUDES(work.mutex);
+
+    // enqueue(Task&&) enqueues a new, unstarted task.
+    void enqueue(Task&& task) EXCLUDES(work.mutex);
+
+    // tryLock() attempts to lock the worker for task enqueuing.
+    // If the lock was successful then true is returned, and the caller must
+    // call enqueueAndUnlock().
+    bool tryLock() EXCLUDES(work.mutex) TRY_ACQUIRE(true, work.mutex);
+
+    // enqueueAndUnlock() enqueues the task and unlocks the worker.
+    // Must only be called after a call to tryLock() which returned true.
+    // _Releases_lock_(work.mutex)
+    void enqueueAndUnlock(Task&& task) REQUIRES(work.mutex) RELEASE(work.mutex);
+
+    // runUntilShutdown() processes all tasks and fibers until there are no more
+    // and shutdown is true, upon which runUntilShutdown() returns.
+    void runUntilShutdown() REQUIRES(work.mutex);
+
+    // steal() attempts to steal a Task from the worker for another worker.
+    // Returns true if a task was taken and assigned to out, otherwise false.
+    bool steal(Task& out) EXCLUDES(work.mutex);
+
+    // getCurrent() returns the Worker currently bound to the current
+    // thread.
+    static inline Worker* getCurrent();
+
+    // getCurrentFiber() returns the Fiber currently being executed.
+    inline Fiber* getCurrentFiber() const;
+
+    // Unique identifier of the Worker.
+    const uint32_t id;
+
+   private:
+    // run() is the task processing function for the worker.
+    // run() processes tasks until stop() is called.
+    void run() REQUIRES(work.mutex);
+
+    // createWorkerFiber() creates a new fiber that when executed calls
+    // run().
+    Fiber* createWorkerFiber() REQUIRES(work.mutex);
+
+    // switchToFiber() switches execution to the given fiber. The fiber
+    // must belong to this worker.
+    void switchToFiber(Fiber*) REQUIRES(work.mutex);
+
+    // runUntilIdle() executes all pending tasks and then returns.
+    void runUntilIdle() REQUIRES(work.mutex);
+
+    // waitForWork() blocks until new work is available, potentially calling
+    // spinForWorkAndLock().
+    void waitForWork() REQUIRES(work.mutex);
+
+    // spinForWorkAndLock() attempts to steal work from another Worker, and keeps
+    // the thread awake for a short duration. This reduces overheads of
+    // frequently putting the thread to sleep and re-waking. It locks the mutex
+    // before returning so that a stolen task cannot be re-stolen by other workers.
+    void spinForWorkAndLock() ACQUIRE(work.mutex);
+
+    // enqueueFiberTimeouts() enqueues all the fibers that have finished
+    // waiting.
+    void enqueueFiberTimeouts() REQUIRES(work.mutex);
+
+    inline void changeFiberState(Fiber* fiber,
+                                 Fiber::State from,
+                                 Fiber::State to) const REQUIRES(work.mutex);
+
+    inline void setFiberState(Fiber* fiber, Fiber::State to) const
+        REQUIRES(work.mutex);
+
+    // Work holds tasks and fibers that are enqueued on the Worker.
+    struct Work {
+      inline Work(Allocator*);
+
+      std::atomic<uint64_t> num = {0};  // tasks.size() + fibers.size()
+      GUARDED_BY(mutex) uint64_t numBlockedFibers = 0;
+      GUARDED_BY(mutex) TaskQueue tasks;
+      GUARDED_BY(mutex) FiberQueue fibers;
+      GUARDED_BY(mutex) WaitingFibers waiting;
+      GUARDED_BY(mutex) bool notifyAdded = true;
+      std::condition_variable added;
+      marl::mutex mutex;
+
+      template <typename F>
+      inline void wait(F&&) REQUIRES(mutex);
+    };
+
+    // https://en.wikipedia.org/wiki/Xorshift
+    class FastRnd {
+     public:
+      inline uint64_t operator()() {
+        x ^= x << 13;
+        x ^= x >> 7;
+        x ^= x << 17;
+        return x;
+      }
+
+     private:
+      uint64_t x = std::chrono::system_clock::now().time_since_epoch().count();
+    };
+
+    // The current worker bound to the current thread.
+    MARL_DECLARE_THREAD_LOCAL(Worker*, current);
+
+    Mode const mode;
+    Scheduler* const scheduler;
+    Allocator::unique_ptr<Fiber> mainFiber;
+    Fiber* currentFiber = nullptr;
+    Thread thread;
+    Work work;
+    FiberSet idleFibers;  // Fibers that have completed which can be reused.
+    containers::vector<Allocator::unique_ptr<Fiber>, 16>
+        workerFibers;  // All fibers created by this worker.
+    FastRnd rng;
+    bool shutdown = false;
+  };
+
+  // stealWork() attempts to steal a task from the worker with the given id.
+  // Returns true if a task was stolen and assigned to out, otherwise false.
+  bool stealWork(Worker* thief, uint64_t from, Task& out);
+
+  // onBeginSpinning() is called when a Worker calls spinForWorkAndLock().
+  // The scheduler will prioritize this worker for new tasks to try to prevent
+  // it going to sleep.
+  void onBeginSpinning(int workerId);
+
+  // setBound() sets the scheduler bound to the current thread.
+  static void setBound(Scheduler* scheduler);
+
+  // The scheduler currently bound to the current thread.
+  MARL_DECLARE_THREAD_LOCAL(Scheduler*, bound);
+
+  // The immutable configuration used to build the scheduler.
+  const Config cfg;
+
+  std::array<std::atomic<int>, MaxWorkerThreads> spinningWorkers;
+  std::atomic<unsigned int> nextSpinningWorkerIdx = {0x8000000};
+
+  std::atomic<unsigned int> nextEnqueueIndex = {0};
+  std::array<Worker*, MaxWorkerThreads> workerThreads;
+
+  struct SingleThreadedWorkers {
+    inline SingleThreadedWorkers(Allocator*);
+
+    using WorkerByTid =
+        containers::unordered_map<std::thread::id,
+                                  Allocator::unique_ptr<Worker>>;
+    marl::mutex mutex;
+    GUARDED_BY(mutex) std::condition_variable unbind;
+    GUARDED_BY(mutex) WorkerByTid byTid;
+  };
+  SingleThreadedWorkers singleThreadedWorkers;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// Scheduler::Config
+////////////////////////////////////////////////////////////////////////////////
+// setAllocator() stores alloc as the Config's allocator and returns the Config
+// so that further setters can be chained.
+Scheduler::Config& Scheduler::Config::setAllocator(Allocator* alloc) {
+  this->allocator = alloc;
+  return *this;
+}
+
+// setFiberStackSize() stores size as the Config's fiber stack size and returns
+// the Config so that further setters can be chained.
+Scheduler::Config& Scheduler::Config::setFiberStackSize(size_t size) {
+  this->fiberStackSize = size;
+  return *this;
+}
+
+// setWorkerThreadCount() stores count as the number of dedicated worker
+// threads and returns the Config so that further setters can be chained.
+Scheduler::Config& Scheduler::Config::setWorkerThreadCount(int count) {
+  this->workerThread.count = count;
+  return *this;
+}
+
+// setWorkerThreadInitializer() copies initializer into the worker thread
+// settings and returns the Config so that further setters can be chained.
+Scheduler::Config& Scheduler::Config::setWorkerThreadInitializer(
+    const ThreadInitializer& initializer) {
+  this->workerThread.initializer = initializer;
+  return *this;
+}
+
+// setWorkerThreadAffinityPolicy() copies the shared policy pointer into the
+// worker thread settings and returns the Config so that further setters can
+// be chained.
+Scheduler::Config& Scheduler::Config::setWorkerThreadAffinityPolicy(
+    const std::shared_ptr<Thread::Affinity::Policy>& policy) {
+  this->workerThread.affinityPolicy = policy;
+  return *this;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Scheduler::Fiber
+////////////////////////////////////////////////////////////////////////////////
+// Timed wait with a lock and predicate. The timeout is cast to the
+// scheduler's TimePoint representation and the call is forwarded to the
+// worker that owns this fiber. Returns the worker's result.
+// NOTE(review): the target clock is passed explicitly to time_point_cast, so
+// this presumably only compiles when Clock is TimePoint::clock -- confirm.
+template <typename Clock, typename Duration>
+bool Scheduler::Fiber::wait(
+    marl::lock& lock,
+    const std::chrono::time_point<Clock, Duration>& timeout,
+    const Predicate& pred) {
+  auto deadline =
+      std::chrono::time_point_cast<typename TimePoint::duration,
+                                   typename TimePoint::clock>(timeout);
+  return worker->wait(lock, &deadline, pred);
+}
+
+// Untimed wait without lock or predicate: forwards to the owning worker with
+// no timeout. The worker's return value is discarded.
+void Scheduler::Fiber::wait() {
+  const TimePoint* const noTimeout = nullptr;
+  worker->wait(noTimeout);
+}
+
+// Timed wait without lock or predicate. The timeout is cast to the
+// scheduler's TimePoint representation and the call is forwarded to the
+// worker that owns this fiber. Returns the worker's result.
+// NOTE(review): the target clock is passed explicitly to time_point_cast, so
+// this presumably only compiles when Clock is TimePoint::clock -- confirm.
+template <typename Clock, typename Duration>
+bool Scheduler::Fiber::wait(
+    const std::chrono::time_point<Clock, Duration>& timeout) {
+  auto deadline =
+      std::chrono::time_point_cast<typename TimePoint::duration,
+                                   typename TimePoint::clock>(timeout);
+  return worker->wait(&deadline);
+}
+
+// getCurrent() returns the value of the thread-local Worker::current.
+Scheduler::Worker* Scheduler::Worker::getCurrent() {
+  return current;
+}
+
+// getCurrentFiber() returns this worker's currentFiber pointer.
+Scheduler::Fiber* Scheduler::Worker::getCurrentFiber() const {
+  return this->currentFiber;
+}
+
+// schedule() hands the task t to the scheduler bound to the current thread,
+// which enqueues it for asynchronous execution.
+inline void schedule(Task&& t) {
+  MARL_ASSERT_HAS_BOUND_SCHEDULER("marl::schedule");
+  Scheduler* const target = Scheduler::get();
+  target->enqueue(std::move(t));
+}
+
+// schedule() binds f to the given arguments, wraps the result in a Task and
+// enqueues it on the scheduler bound to the current thread.
+template <typename Function, typename... Args>
+inline void schedule(Function&& f, Args&&... args) {
+  MARL_ASSERT_HAS_BOUND_SCHEDULER("marl::schedule");
+  Scheduler* const target = Scheduler::get();
+  Task bound(
+      std::bind(std::forward<Function>(f), std::forward<Args>(args)...));
+  target->enqueue(std::move(bound));
+}
+
+// schedule() wraps f in a Task and enqueues it on the scheduler bound to the
+// current thread.
+template <typename Function>
+inline void schedule(Function&& f) {
+  MARL_ASSERT_HAS_BOUND_SCHEDULER("marl::schedule");
+  Scheduler* const target = Scheduler::get();
+  Task task(std::forward<Function>(f));
+  target->enqueue(std::move(task));
+}
+
+}  // namespace marl
+
+#endif  // marl_scheduler_h
--- a/3party/marl/include/marl/task.h
+++ b/3party/marl/include/marl/task.h
@ -0,0 +1,107 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef marl_task_h
+#define marl_task_h
+
+#include "export.h"
+
+#include <functional>
+
+namespace marl {
+
+// Task is a unit of work for the scheduler: a callable paired with Flags.
+class Task {
+ public:
+  using Function = std::function<void()>;
+
+  enum class Flags {
+    None = 0,
+
+    // SameThread ensures the task will be run on the same thread that scheduled
+    // the task. This can offer performance improvements if the current thread
+    // is immediately going to block on the newly scheduled task, by reducing
+    // overheads of waking another thread.
+    SameThread = 1,
+  };
+
+  MARL_NO_EXPORT inline Task();             // empty function, Flags::None
+  MARL_NO_EXPORT inline Task(const Task&);  // copies function and flags
+  MARL_NO_EXPORT inline Task(Task&&);       // moves function, copies flags
+  MARL_NO_EXPORT inline Task(const Function& function,
+                             Flags flags = Flags::None);
+  MARL_NO_EXPORT inline Task(Function&& function, Flags flags = Flags::None);
+  MARL_NO_EXPORT inline Task& operator=(const Task&);
+  MARL_NO_EXPORT inline Task& operator=(Task&&);
+  MARL_NO_EXPORT inline Task& operator=(const Function&);  // flags -> None
+  MARL_NO_EXPORT inline Task& operator=(Function&&);       // flags -> None
+
+  // operator bool() returns true if the Task has a valid (non-empty) function.
+  MARL_NO_EXPORT inline operator bool() const;
+
+  // operator()() runs the task by invoking the stored function.
+  MARL_NO_EXPORT inline void operator()() const;
+
+  // is() returns true if every bit of flag is set in this Task's flags.
+  MARL_NO_EXPORT inline bool is(Flags flag) const;
+
+ private:
+  Function function;
+  Flags flags = Flags::None;
+};
+
+Task::Task() = default;
+Task::Task(const Task& o) : function(o.function), flags(o.flags) {}
+Task::Task(Task&& o) : function(std::move(o.function)), flags(o.flags) {}
+Task::Task(const Function& function_, Flags flags_ /* = Flags::None */)
+    : function(function_), flags(flags_) {}
+Task::Task(Function&& function_, Flags flags_ /* = Flags::None */)
+    : function(std::move(function_)), flags(flags_) {}
+Task& Task::operator=(const Task& o) {
+  function = o.function;
+  flags = o.flags;
+  return *this;
+}
+Task& Task::operator=(Task&& o) {
+  function = std::move(o.function);  // o.function is left moved-from
+  flags = o.flags;                   // flags are copied; o.flags is unchanged
+  return *this;
+}
+
+Task& Task::operator=(const Function& f) {
+  function = f;
+  flags = Flags::None;  // assigning a bare Function discards previous flags
+  return *this;
+}
+Task& Task::operator=(Function&& f) {
+  function = std::move(f);
+  flags = Flags::None;  // assigning a bare Function discards previous flags
+  return *this;
+}
+// True when a callable is stored, i.e. the wrapped std::function is non-empty.
+Task::operator bool() const {
+  return static_cast<bool>(function);
+}
+
+void Task::operator()() const {
+  function();  // an empty std::function throws std::bad_function_call here
+}
+
+// is() reports whether every bit of flag is present in this task's flags.
+// Note that Flags::None has no bits set, so is(Flags::None) is always true.
+bool Task::is(Flags flag) const {
+  const int wanted = static_cast<int>(flag);
+  const int present = static_cast<int>(flags);
+  return (present & wanted) == wanted;
+}
+
+}  // namespace marl
+
+#endif  // marl_task_h
--- a/3party/marl/include/marl/thread.h
+++ b/3party/marl/include/marl/thread.h
@ -0,0 +1,172 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef marl_thread_h
+#define marl_thread_h
+
+#include "containers.h"
+#include "export.h"
+
+#include <functional>
+
+namespace marl {
+
+// Thread provides an OS abstraction for threads of execution.
+class Thread {
+ public:
+  using Func = std::function<void()>;
+
+  // Core identifies a logical processor unit.
+  // How a core is identified varies by platform.
+  struct Core {
+    struct Windows {
+      uint8_t group;  // Group number
+      uint8_t index;  // Core within the processor group
+    };
+    struct Pthread {
+      uint16_t index;  // Core number
+    };
+    union {  // anonymous union: windows and pthread share the same storage
+      Windows windows;
+      Pthread pthread;
+    };
+
+    // Comparison functions (defined below in terms of pthread.index)
+    MARL_NO_EXPORT inline bool operator==(Core) const;
+    MARL_NO_EXPORT inline bool operator<(Core) const;
+  };
+
+  // Affinity holds the affinity mask for a thread - a description of what cores
+  // the thread is allowed to run on.
+  struct Affinity {
+    // supported is true if marl supports controlling thread affinity for this
+    // platform.
+#if defined(_WIN32) ||                                                       \
+    (defined(__linux__) && !defined(__ANDROID__) && !defined(__BIONIC__)) || \
+    defined(__FreeBSD__)
+    static constexpr bool supported = true;
+#else
+    static constexpr bool supported = false;
+#endif
+
+    // Policy is an interface that provides a get() method for returning an
+    // Affinity for the given thread by id.
+    class Policy {
+     public:
+      virtual ~Policy() {}
+
+      // anyOf() returns a Policy that returns an Affinity for a number of
+      // available cores in affinity.
+      //
+      // Windows requires that each thread is only associated with a
+      // single affinity group, so the Policy's returned affinity will contain
+      // cores all from the same group.
+      MARL_EXPORT static std::shared_ptr<Policy> anyOf(
+          Affinity&& affinity,
+          Allocator* allocator = Allocator::Default);
+
+      // oneOf() returns a Policy that returns an affinity with a single enabled
+      // core from affinity. The single enabled core in the Policy's returned
+      // affinity is:
+      //      affinity[threadId % affinity.count()]
+      MARL_EXPORT static std::shared_ptr<Policy> oneOf(
+          Affinity&& affinity,
+          Allocator* allocator = Allocator::Default);
+
+      // get() returns the thread Affinity for the given thread by id.
+      MARL_EXPORT virtual Affinity get(uint32_t threadId,
+                                       Allocator* allocator) const = 0;
+    };
+
+    MARL_EXPORT Affinity(Allocator*);
+
+    MARL_EXPORT Affinity(Affinity&&);
+
+    MARL_EXPORT Affinity& operator=(Affinity&&);
+
+    MARL_EXPORT Affinity(const Affinity&, Allocator* allocator);
+
+    // all() returns an Affinity with all the cores available to the process.
+    MARL_EXPORT static Affinity all(Allocator* allocator = Allocator::Default);
+
+    MARL_EXPORT Affinity(std::initializer_list<Core>, Allocator* allocator);
+
+    MARL_EXPORT Affinity(const containers::vector<Core, 32>&,
+                         Allocator* allocator);
+
+    // count() returns the number of enabled cores in the affinity.
+    MARL_EXPORT size_t count() const;
+
+    // operator[] returns the i'th enabled core from this affinity.
+    MARL_EXPORT Core operator[](size_t index) const;
+
+    // add() adds the cores from the given affinity to this affinity.
+    // This affinity is returned to allow for fluent calls.
+    MARL_EXPORT Affinity& add(const Affinity&);
+
+    // remove() removes the cores from the given affinity from this affinity.
+    // This affinity is returned to allow for fluent calls.
+    MARL_EXPORT Affinity& remove(const Affinity&);
+
+   private:
+    Affinity(const Affinity&) = delete;  // copy with (const Affinity&, Allocator*)
+
+    containers::vector<Core, 32> cores;
+  };
+
+  MARL_EXPORT Thread() = default;  // impl stays null
+
+  MARL_EXPORT Thread(Thread&&);
+
+  MARL_EXPORT Thread& operator=(Thread&&);
+
+  // Start a new thread using the given affinity that calls func.
+  MARL_EXPORT Thread(Affinity&& affinity, Func&& func);
+
+  MARL_EXPORT ~Thread();
+
+  // join() blocks until the thread completes.
+  MARL_EXPORT void join();
+
+  // setName() sets the name of the currently executing thread for displaying
+  // in a debugger.
+  MARL_EXPORT static void setName(const char* fmt, ...);
+
+  // numLogicalCPUs() returns the number of available logical CPU cores for
+  // the system.
+  MARL_EXPORT static unsigned int numLogicalCPUs();
+
+ private:
+  Thread(const Thread&) = delete;  // Thread is move-only
+  Thread& operator=(const Thread&) = delete;
+
+  class Impl;
+  Impl* impl = nullptr;  // null for a default-constructed Thread
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// Thread::Core
+////////////////////////////////////////////////////////////////////////////////
+// Comparison functions: both read the Core union through its Pthread view.
+bool Thread::Core::operator==(Core other) const {
+  return pthread.index == other.pthread.index;
+}
+
+bool Thread::Core::operator<(Core other) const {
+  return pthread.index < other.pthread.index;  // orders by the Pthread view
+}
+
+}  // namespace marl
+
+#endif  // marl_thread_h
--- a/3party/marl/include/marl/thread_local.h
+++ b/3party/marl/include/marl/thread_local.h
@ -0,0 +1,67 @@
+// Copyright 2023 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// A wrapper around a thread_local variable, or a pthread key
+
+#ifndef marl_thread_local_h
+#define marl_thread_local_h
+
+#ifdef MARL_USE_PTHREAD_THREAD_LOCAL
+#include "debug.h"
+
+#include <pthread.h>
+#include <type_traits>
+
+// ThreadLocal<T> keeps one pointer-typed value per thread in a pthread key.
+// It is the implementation selected when MARL_USE_PTHREAD_THREAD_LOCAL is
+// defined, in place of the thread_local keyword.
+template <typename T>
+class ThreadLocal {
+  static_assert(std::is_pointer<T>::value,
+                "The current implementation of ThreadLocal requires that T "
+                "must be a pointer");
+
+ public:
+  // Creates the key and stores v for the constructing thread only. Per POSIX,
+  // every other thread reads a null value until it assigns one itself.
+  // NOTE(review): the pthread return codes are ignored, as in the original.
+  inline ThreadLocal(T v) {
+    pthread_key_create(&key, nullptr);  // no per-thread destructor registered
+    pthread_setspecific(key, v);
+  }
+  inline ~ThreadLocal() { pthread_key_delete(key); }
+
+  // Reads the calling thread's value.
+  inline operator T() const { return static_cast<T>(pthread_getspecific(key)); }
+
+  // Replaces the calling thread's value.
+  inline ThreadLocal& operator=(T v) {
+    pthread_setspecific(key, v);
+    return *this;
+  }
+
+ private:
+  // Non-copyable: the key is owned by exactly one ThreadLocal.
+  ThreadLocal(const ThreadLocal&) = delete;
+  ThreadLocal& operator=(const ThreadLocal&) = delete;
+
+  pthread_key_t key;
+};
+
+#define MARL_DECLARE_THREAD_LOCAL(TYPE, NAME) static ThreadLocal<TYPE> NAME
+#define MARL_INSTANTIATE_THREAD_LOCAL(TYPE, NAME, VALUE) \
+  ThreadLocal<TYPE> NAME {                               \
+    VALUE                                                \
+  }
+
+#else
+
+#define MARL_DECLARE_THREAD_LOCAL(TYPE, NAME) static thread_local TYPE NAME
+#define MARL_INSTANTIATE_THREAD_LOCAL(TYPE, NAME, VALUE) \
+  thread_local TYPE NAME {                               \
+    VALUE                                                \
+  }
+
+#endif
+
+#endif  // marl_thread_local_h
--- a/3party/marl/include/marl/ticket.h
+++ b/3party/marl/include/marl/ticket.h
@ -0,0 +1,257 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef marl_ticket_h
+#define marl_ticket_h
+
+#include "conditionvariable.h"
+#include "pool.h"
+#include "scheduler.h"
+
+namespace marl {
+
+// Ticket is a synchronization primitive used to serially order execution.
+//
+// Tickets exist in 3 mutually exclusive states: Waiting, Called and Finished.
+//
+// Tickets are obtained from a Ticket::Queue, using the Ticket::Queue::take()
+// methods. The order in which tickets are taken from the queue dictates the
+// order in which they are called.
+//
+// The first ticket to be taken from a queue will be in the 'called' state,
+// subsequent tickets will be in the 'waiting' state.
+//
+// Ticket::wait() will block until the ticket is called.
+//
+// Ticket::done() moves the ticket into the 'finished' state. If all preceding
+// tickets are finished, done() will call the next unfinished ticket.
+//
+// If the last remaining reference to an unfinished ticket is dropped then
+// done() will be automatically called on that ticket.
+//
+// Example:
+//
+//  void runTasksConcurrentThenSerially(int numConcurrentTasks)
+//  {
+//      marl::Ticket::Queue queue;
+//      for (int i = 0; i < numConcurrentTasks; i++)
+//      {
+//          auto ticket = queue.take();
+//          marl::schedule([=] {
+//              doConcurrentWork(); // <- function may be called concurrently
+//              ticket.wait(); // <- serialize tasks
+//              doSerialWork(); // <- function will not be called concurrently
+//              ticket.done(); // <- optional, as done() is called implicitly on
+//                             // dropping of last reference
+//          });
+//      }
+//  }
+class Ticket {
+  struct Shared;
+  struct Record;
+
+ public:
+  using OnCall = std::function<void()>;
+
+  // Queue hands out Tickets.
+  class Queue {
+   public:
+    // take() returns a single ticket from the queue.
+    MARL_NO_EXPORT inline Ticket take();
+
+    // take() retrieves count tickets from the queue, calling f() with each
+    // retrieved ticket.
+    // F must be a function of the signature: void(Ticket&&)
+    template <typename F>
+    MARL_NO_EXPORT inline void take(size_t count, const F& f);
+
+   private:
+    std::shared_ptr<Shared> shared = std::make_shared<Shared>();
+    UnboundedPool<Record> pool;  // source of the Records loaned out by take()
+  };
+
+  MARL_NO_EXPORT inline Ticket() = default;
+  MARL_NO_EXPORT inline Ticket(const Ticket& other) = default;
+  MARL_NO_EXPORT inline Ticket(Ticket&& other) = default;
+  MARL_NO_EXPORT inline Ticket& operator=(const Ticket& other) = default;
+
+  // wait() blocks until the ticket is called.
+  MARL_NO_EXPORT inline void wait() const;
+
+  // done() marks the ticket as finished and calls the next ticket.
+  MARL_NO_EXPORT inline void done() const;
+
+  // onCall() registers the function f to be invoked when this ticket is
+  // called. If the ticket is already called prior to calling onCall(), then
+  // f is passed to marl::schedule() straight away instead of being stored.
+  // F must be a function of the OnCall signature.
+  template <typename F>
+  MARL_NO_EXPORT inline void onCall(F&& f) const;
+
+ private:
+  // Internal doubly-linked-list data structure. One per ticket instance.
+  struct Record {
+    MARL_NO_EXPORT inline ~Record();
+
+    MARL_NO_EXPORT inline void done();
+    MARL_NO_EXPORT inline void callAndUnlock(marl::lock& lock);
+    MARL_NO_EXPORT inline void unlink();  // guarded by shared->mutex
+
+    ConditionVariable isCalledCondVar;  // notified once isCalled becomes true
+
+    std::shared_ptr<Shared> shared;
+    Record* next = nullptr;  // guarded by shared->mutex
+    Record* prev = nullptr;  // guarded by shared->mutex
+    OnCall onCall;           // guarded by shared->mutex
+    bool isCalled = false;   // guarded by shared->mutex
+    std::atomic<bool> isDone = {false};  // set by the first done() call
+  };
+
+  // Data shared between all tickets and the queue.
+  struct Shared {
+    marl::mutex mutex;
+    Record tail;  // list sentinel: tail.prev is the last queued record
+  };
+
+  MARL_NO_EXPORT inline Ticket(Loan<Record>&& record);
+
+  Loan<Record> record;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// Ticket (the private constructor below moves the loaned Record into place)
+////////////////////////////////////////////////////////////////////////////////
+
+Ticket::Ticket(Loan<Record>&& record) : record(std::move(record)) {}
+
+void Ticket::wait() const {
+  marl::lock lock(record->shared->mutex);  // isCalled is guarded by this mutex
+  record->isCalledCondVar.wait(lock, [this] { return record->isCalled; });
+}
+
+void Ticket::done() const {
+  record->done();  // no-op if this ticket was already finished
+}
+
+// onCall() arranges for f to run when this ticket is called. If the ticket
+// has already been called, f is handed to marl::schedule() right away;
+// otherwise it is stored, chained after any callback registered earlier.
+template <typename Function>
+void Ticket::onCall(Function&& f) const {
+  marl::lock lock(record->shared->mutex);
+  if (record->isCalled) {
+    marl::schedule(std::forward<Function>(f));
+    return;
+  }
+  if (!record->onCall) {
+    // First registration: store the callback directly.
+    record->onCall = std::forward<Function>(f);
+    return;
+  }
+  // A callback is already registered: replace it with one that runs the
+  // existing callback followed by the new one.
+  struct Chained {
+    void operator()() const {
+      earlier();
+      later();
+    }
+    OnCall earlier, later;
+  };
+  record->onCall =
+      Chained{std::move(record->onCall), std::forward<Function>(f)};
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Ticket::Queue
+////////////////////////////////////////////////////////////////////////////////
+
+Ticket Ticket::Queue::take() {
+  Ticket out;  // assigned by the callback of the take(1, ...) call below
+  take(1, [&](Ticket&& ticket) { out = std::move(ticket); });
+  return out;
+}
+
+// take() borrows n Records from the pool, chains them together in borrow
+// order and passes a Ticket for each one to f(). The chain is then appended to
+// the queue under shared->mutex. If the queue held no records beforehand, the
+// first new ticket is called straight away.
+template <typename F>
+void Ticket::Queue::take(size_t n, const F& f) {
+  Loan<Record> first, last;
+  pool.borrow(n, [&](Loan<Record>&& record) {
+    Loan<Record> rec = std::move(record);
+    rec->shared = shared;
+    if (first.get() == nullptr) {
+      first = rec;
+    }
+    if (last.get() != nullptr) {
+      last->next = rec.get();
+      rec->prev = last.get();
+    }
+    last = rec;
+    // Ticket(...) is already a temporary, so no std::move() is needed.
+    f(Ticket(std::move(rec)));
+  });
+  if (last.get() == nullptr) {
+    // No record was borrowed (for example n == 0): there is nothing to link,
+    // and dereferencing the empty loans below would be invalid.
+    return;
+  }
+  last->next = &shared->tail;
+  marl::lock lock(shared->mutex);
+  first->prev = shared->tail.prev;
+  shared->tail.prev = last.get();
+  if (first->prev == nullptr) {
+    // Nothing was queued ahead of the new chain: call its first ticket now.
+    first->callAndUnlock(lock);
+  } else {
+    first->prev->next = first.get();
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Ticket::Record
+////////////////////////////////////////////////////////////////////////////////
+
+Ticket::Record::~Record() {
+  if (shared != nullptr) {  // shared is only assigned by Queue::take()
+    done();
+  }
+}
+
+void Ticket::Record::done() {
+  if (isDone.exchange(true)) {
+    return;  // already finished: only the first done() call proceeds
+  }
+  marl::lock lock(shared->mutex);
+  auto callNext = (prev == nullptr && next != nullptr) ? next : nullptr;
+  unlink();  // callNext was chosen above from the links as they were
+  if (callNext != nullptr) {
+    // lock needs to be held otherwise callNext might be destructed.
+    callNext->callAndUnlock(lock);
+  }
+}
+
+void Ticket::Record::callAndUnlock(marl::lock& lock) {
+  if (isCalled) {
+    return;  // NOTE(review): lock is left held on this path
+  }
+  isCalled = true;
+  OnCall callback;
+  std::swap(callback, onCall);  // take the callback out while still locked
+  isCalledCondVar.notify_all();
+  lock.unlock_no_tsa();  // released before the callback is scheduled
+
+  if (callback) {
+    marl::schedule(std::move(callback));
+  }
+}
+
+// unlink() removes this record from the doubly-linked list: its neighbours
+// are joined to each other and this record's own links are cleared.
+// The caller must hold shared->mutex.
+void Ticket::Record::unlink() {
+  Record* const before = prev;
+  Record* const after = next;
+  if (before != nullptr) {
+    before->next = after;
+  }
+  if (after != nullptr) {
+    after->prev = before;
+  }
+  next = nullptr;
+  prev = nullptr;
+}
+
+}  // namespace marl
+
+#endif  // marl_ticket_h
--- a/3party/marl/include/marl/trace.h
+++ b/3party/marl/include/marl/trace.h
@ -0,0 +1,249 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// The Trace API produces a trace event file that can be consumed with Chrome's
+// chrome://tracing viewer.
+// Documentation can be found at:
+//   https://www.chromium.org/developers/how-tos/trace-event-profiling-tool
+//   https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/edit
+
+#ifndef marl_trace_h
+#define marl_trace_h
+
+#define MARL_TRACE_ENABLED 0
+
+#if MARL_TRACE_ENABLED
+
+#include <array>
+#include <atomic>
+#include <chrono>
+#include <condition_variable>
+#include <cstdarg>
+#include <cstring>
+#include <mutex>
+#include <ostream>
+#include <queue>
+#include <thread>
+
+namespace marl {
+
+// Trace writes a trace event file into the current working directory that can
+// be consumed with Chrome's chrome://tracing viewer.
+// Use the MARL_* macros below instead of using this class directly.
+class Trace {
+ public:
+  static constexpr size_t MaxEventNameLength = 64;  // buffer size, incl. NUL
+
+  static Trace* get();  // may be nullptr: the scoped events below check it
+
+  void nameThread(const char* fmt, ...);
+  void beginEvent(const char* fmt, ...);
+  void endEvent();
+  void beginAsyncEvent(uint32_t id, const char* fmt, ...);
+  void endAsyncEvent(uint32_t id, const char* fmt, ...);
+
+  class ScopedEvent {
+   public:
+    inline ScopedEvent(const char* fmt, ...);  // begins the event
+    inline ~ScopedEvent();                     // ends the event
+
+   private:
+    Trace* const trace;
+  };
+
+  class ScopedAsyncEvent {
+   public:
+    inline ScopedAsyncEvent(uint32_t id, const char* fmt, ...);
+    inline ~ScopedAsyncEvent();
+
+   private:
+    Trace* const trace;
+    const uint32_t id;
+    std::string name;  // formatted name, reused when the event is ended
+  };
+
+ private:
+  Trace();
+  ~Trace();
+  Trace(const Trace&) = delete;
+  Trace& operator=(const Trace&) = delete;
+
+  struct Event {
+    enum class Type : uint8_t {
+      Begin = 'B',
+      End = 'E',
+      Complete = 'X',
+      Instant = 'i',
+      Counter = 'C',
+      AsyncStart = 'b',
+      AsyncInstant = 'n',
+      AsyncEnd = 'e',
+      FlowStart = 's',
+      FlowStep = 't',
+      FlowEnd = 'f',
+      Sample = 'P',
+      ObjectCreated = 'N',
+      ObjectSnapshot = 'O',
+      ObjectDestroyed = 'D',
+      Metadata = 'M',
+      GlobalMemoryDump = 'V',
+      ProcessMemoryDump = 'v',
+      Mark = 'R',
+      ClockSync = 'c',
+      ContextEnter = '(',
+      ContextLeave = ')',
+
+      // Internal types
+      Shutdown = 'S',
+    };
+
+    Event();
+    virtual ~Event() = default;
+    virtual Type type() const = 0;
+    virtual void write(std::ostream& out) const;
+
+    char name[MaxEventNameLength] = {};
+    const char** categories = nullptr;
+    uint64_t timestamp = 0;  // in microseconds
+    uint32_t processID = 0;
+    uint32_t threadID;  // no default member initializer here
+    uint32_t fiberID;   // no default member initializer here
+  };
+
+  struct BeginEvent : public Event {
+    Type type() const override { return Type::Begin; }
+  };
+  struct EndEvent : public Event {
+    Type type() const override { return Type::End; }
+  };
+  struct MetadataEvent : public Event {
+    Type type() const override { return Type::Metadata; }
+  };
+  struct Shutdown : public Event {
+    Type type() const override { return Type::Shutdown; }
+  };
+
+  struct AsyncEvent : public Event {
+    void write(std::ostream& out) const override;
+    uint32_t id;
+  };
+
+  struct AsyncStartEvent : public AsyncEvent {
+    Type type() const override { return Type::AsyncStart; }
+  };
+  struct AsyncEndEvent : public AsyncEvent {
+    Type type() const override { return Type::AsyncEnd; }
+  };
+
+  struct NameThreadEvent : public MetadataEvent {
+    void write(std::ostream& out) const override;
+  };
+
+  uint64_t timestamp();  // in microseconds
+
+  void put(Event*);
+  std::unique_ptr<Event> take();
+
+  struct EventQueue {
+    std::queue<std::unique_ptr<Event> > data;  // guarded by mutex
+    std::condition_variable condition;
+    std::mutex mutex;
+  };
+  // TODO: Increasing this from 1 can cause events to go out of order.
+  // Investigate, fix.
+  std::array<EventQueue, 1> eventQueues;
+  std::atomic<unsigned int> eventQueueWriteIdx = {0};
+  unsigned int eventQueueReadIdx = 0;
+  std::chrono::time_point<std::chrono::high_resolution_clock> createdAt =
+      std::chrono::high_resolution_clock::now();
+  std::thread thread;
+  std::atomic<bool> stopped = {false};
+};
+
+// Formats the event name from fmt and the variadic arguments into a buffer of
+// MaxEventNameLength bytes and begins the event on the Trace returned by
+// Trace::get(). Does nothing when Trace::get() returns nullptr.
+Trace::ScopedEvent::ScopedEvent(const char* fmt, ...) : trace(Trace::get()) {
+  if (trace != nullptr) {
+    char name[Trace::MaxEventNameLength];
+    va_list vararg;
+    va_start(vararg, fmt);
+    vsnprintf(name, Trace::MaxEventNameLength, fmt, vararg);
+    va_end(vararg);
+
+    // beginEvent() takes a printf-style format, so the already formatted name
+    // is passed as an argument; any '%' in it must not be interpreted again.
+    trace->beginEvent("%s", name);
+  }
+}
+
+Trace::ScopedEvent::~ScopedEvent() {
+  if (trace != nullptr) {
+    trace->endEvent();  // pairs with the beginEvent() made by the constructor
+  }
+}
+
+// Formats the event name from fmt and the variadic arguments, remembers it in
+// name for the destructor, and begins the async event with the given id.
+// Does nothing when Trace::get() returns nullptr.
+Trace::ScopedAsyncEvent::ScopedAsyncEvent(uint32_t id, const char* fmt, ...)
+    : trace(Trace::get()), id(id) {
+  if (trace == nullptr) {
+    return;
+  }
+  char formatted[Trace::MaxEventNameLength];
+  va_list args;
+  va_start(args, fmt);
+  vsnprintf(formatted, sizeof(formatted), fmt, args);
+  va_end(args);
+  name = formatted;
+
+  trace->beginAsyncEvent(id, "%s", formatted);
+}
+
+Trace::ScopedAsyncEvent::~ScopedAsyncEvent() {
+  if (trace != nullptr) {
+    trace->endAsyncEvent(id, "%s", name.c_str());  // same id and name as begin
+  }
+}
+
+}  // namespace marl
+
+#define MARL_CONCAT_(a, b) a##b
+#define MARL_CONCAT(a, b) MARL_CONCAT_(a, b)
+#define MARL_SCOPED_EVENT(...) \
+  marl::Trace::ScopedEvent MARL_CONCAT(scoped_event, __LINE__)(__VA_ARGS__);
+#define MARL_BEGIN_ASYNC_EVENT(id, ...)    \
+  do {                                     \
+    if (auto t = marl::Trace::get()) {     \
+      t->beginAsyncEvent(id, __VA_ARGS__); \
+    }                                      \
+  } while (false);
+#define MARL_END_ASYNC_EVENT(id, ...)    \
+  do {                                   \
+    if (auto t = marl::Trace::get()) {   \
+      t->endAsyncEvent(id, __VA_ARGS__); \
+    }                                    \
+  } while (false);
+#define MARL_SCOPED_ASYNC_EVENT(id, ...) \
+  marl::Trace::ScopedAsyncEvent MARL_CONCAT(defer_, __LINE__)(id, __VA_ARGS__);
+#define MARL_NAME_THREAD(...)          \
+  do {                                 \
+    if (auto t = marl::Trace::get()) { \
+      t->nameThread(__VA_ARGS__);      \
+    }                                  \
+  } while (false);
+
+#else  // MARL_TRACE_ENABLED
+
+#define MARL_SCOPED_EVENT(...)
+#define MARL_BEGIN_ASYNC_EVENT(id, ...)
+#define MARL_END_ASYNC_EVENT(id, ...)
+#define MARL_SCOPED_ASYNC_EVENT(id, ...)
+#define MARL_NAME_THREAD(...)
+
+#endif  // MARL_TRACE_ENABLED
+
+#endif  // marl_trace_h
--- a/3party/marl/include/marl/tsa.h
+++ b/3party/marl/include/marl/tsa.h
@ -0,0 +1,80 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Stubs Thread-Safety-Analysis annotation macros for platforms that do not
+// support them.
+// See https://clang.llvm.org/docs/ThreadSafetyAnalysis.html
+
+#ifndef marl_tsa_h
+#define marl_tsa_h
+
+// Enable thread safety attributes only with clang.
+// The attributes can be safely erased when compiling with other compilers.
+#if defined(__clang__) && (!defined(SWIG))
+#define THREAD_ANNOTATION_ATTRIBUTE__(x) __attribute__((x))
+#else
+#define THREAD_ANNOTATION_ATTRIBUTE__(x)  // no-op
+#endif
+
+#define CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(capability(x))
+
+#define SCOPED_CAPABILITY THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable)
+
+#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x))
+
+#define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded_by(x))
+
+#define ACQUIRED_BEFORE(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(__VA_ARGS__))
+
+#define ACQUIRED_AFTER(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(acquired_after(__VA_ARGS__))
+
+#define REQUIRES(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(requires_capability(__VA_ARGS__))
+
+#define REQUIRES_SHARED(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(requires_shared_capability(__VA_ARGS__))
+
+#define ACQUIRE(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(acquire_capability(__VA_ARGS__))
+
+#define ACQUIRE_SHARED(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(acquire_shared_capability(__VA_ARGS__))
+
+#define RELEASE(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(release_capability(__VA_ARGS__))
+
+#define RELEASE_SHARED(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(release_shared_capability(__VA_ARGS__))
+
+#define TRY_ACQUIRE(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_capability(__VA_ARGS__))
+
+#define TRY_ACQUIRE_SHARED(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_shared_capability(__VA_ARGS__))
+
+#define EXCLUDES(...) THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(__VA_ARGS__))
+
+#define ASSERT_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(assert_capability(x))
+
+#define ASSERT_SHARED_CAPABILITY(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(assert_shared_capability(x))
+
+#define RETURN_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x))
+
+#define NO_THREAD_SAFETY_ANALYSIS \
+  THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis)
+
+#endif  // marl_tsa_h
--- a/3party/marl/include/marl/waitgroup.h
+++ b/3party/marl/include/marl/waitgroup.h
@ -0,0 +1,108 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef marl_waitgroup_h
+#define marl_waitgroup_h
+
+#include "conditionvariable.h"
+#include "debug.h"
+
+#include <atomic>
+#include <mutex>
+
+namespace marl {
+
+// WaitGroup is a synchronization primitive that holds an internal counter that
+// can be incremented, decremented and waited on until it reaches 0.
+// WaitGroups can be used as a simple mechanism for waiting on a number of
+// concurrently executing tasks to complete.
+//
+// Example:
+//
+//  void runTasksConcurrently(int numConcurrentTasks)
+//  {
+//      // Construct the WaitGroup with an initial count of numConcurrentTasks.
+//      marl::WaitGroup wg(numConcurrentTasks);
+//      for (int i = 0; i < numConcurrentTasks; i++)
+//      {
+//          // Schedule a task to be run asynchronously.
+//          // These may all be run concurrently.
+//          marl::schedule([=] {
+//              // Once the task has finished, decrement the waitgroup counter
+//              // to signal that this has completed.
+//              defer(wg.done());
+//              doSomeWork();
+//          });
+//      }
+//      // Block until all tasks have completed.
+//      wg.wait();
+//  }
+class WaitGroup {
+ public:
+  // Constructs the WaitGroup with the specified initial count.
+  MARL_NO_EXPORT inline WaitGroup(unsigned int initialCount = 0,
+                                  Allocator* allocator = Allocator::Default);
+
+  // add() increments the internal counter by count.
+  MARL_NO_EXPORT inline void add(unsigned int count = 1) const;
+
+  // done() decrements the internal counter by one.
+  // Returns true if the internal count has reached zero.
+  MARL_NO_EXPORT inline bool done() const;
+
+  // wait() blocks until the WaitGroup counter reaches zero.
+  MARL_NO_EXPORT inline void wait() const;
+
+ private:
+  struct Data {
+    MARL_NO_EXPORT inline Data(Allocator* allocator);
+
+    std::atomic<unsigned int> count = {0};
+    ConditionVariable cv;  // notified by done() when count reaches zero
+    marl::mutex mutex;     // taken by done() to notify and by wait() to wait
+  };
+  const std::shared_ptr<Data> data;  // const methods mutate state through this
+};
+
+WaitGroup::Data::Data(Allocator* allocator) : cv(allocator) {}  // cv gets allocator
+
+// Allocates the shared Data block (its condition variable is constructed with
+// allocator) and seeds the counter with initialCount.
+WaitGroup::WaitGroup(unsigned int initialCount /* = 0 */,
+                     Allocator* allocator /* = Allocator::Default */)
+    : data(std::make_shared<Data>(allocator)) {
+  data->count.store(initialCount);
+}
+
+void WaitGroup::add(unsigned int count /* = 1 */) const {
+  data->count += count;  // atomic read-modify-write; the mutex is not taken
+}
+
+// done() atomically decrements the counter. Only the call that brings it to
+// zero takes the mutex, wakes every waiter and returns true; all other calls
+// return false.
+bool WaitGroup::done() const {
+  MARL_ASSERT(data->count > 0, "marl::WaitGroup::done() called too many times");
+  if (--data->count != 0) {
+    return false;
+  }
+  marl::lock lock(data->mutex);
+  data->cv.notify_all();
+  return true;
+}
+
+void WaitGroup::wait() const {
+  marl::lock lock(data->mutex);  // the same mutex done() takes to notify
+  data->cv.wait(lock, [this] { return data->count == 0; });
+}
+
+}  // namespace marl
+
+#endif  // marl_waitgroup_h
--- a/3party/marl/license-checker.cfg
+++ b/3party/marl/license-checker.cfg
@ -0,0 +1,27 @@
+{
+    "licenses": [
+        "Apache-2.0",
+        "Apache-2.0-Header"
+    ],
+    "paths": [
+        {
+            "exclude": [
+                ".clang-format",
+                ".gitignore",
+                ".gitmodules",
+                ".vscode/*.json",
+                "**.md",
+                "AUTHORS",
+                "LICENSE",
+                "go.mod",
+                "build/**",
+                "docs/imgs/*.svg",
+                "kokoro/**.cfg",
+                "third_party/benchmark/**",
+                "third_party/googletest/**",
+                "examples/run_webserver",
+                "examples/shell.emscripten.html"
+            ]
+        }
+    ]
+}
--- a/3party/marl/src/blockingcall_bench.cpp
+++ b/3party/marl/src/blockingcall_bench.cpp
@ -0,0 +1,24 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl/blockingcall.h"
+
+#include "benchmark/benchmark.h"
+
+// Benchmark: each iteration issues one marl::blocking_call() with an empty
+// callable, so the timing reflects the call itself rather than any work done
+// inside it.
+static void BlockingCall(benchmark::State& state) {
+  for (auto _ : state) {
+    marl::blocking_call([] {});
+  }
+}
+BENCHMARK(BlockingCall);
--- a/3party/marl/src/blockingcall_test.cpp
+++ b/3party/marl/src/blockingcall_test.cpp
@ -0,0 +1,71 @@
+// Copyright 2019 The Marl Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl/blockingcall.h"
+
+#include "marl/defer.h"
+
+#include "marl_test.h"
+
+#include <mutex>
+
+// Schedules 100 tasks that each make a void-returning blocking call which
+// contends on a mutex the test initially holds, then waits for all of them.
+TEST_P(WithBoundScheduler, BlockingCallVoidReturn) {
+  constexpr int kTaskCount = 100;
+
+  // Locked up front so that every blocking call stalls until it is released.
+  auto gate = std::make_shared<std::mutex>();
+  gate->lock();
+
+  marl::WaitGroup wg(kTaskCount);
+  for (int task = 0; task < kTaskCount; ++task) {
+    marl::schedule([=] {
+      defer(wg.done());
+      marl::blocking_call([=] {
+        gate->lock();
+        defer(gate->unlock());
+      });
+    });
+  }
+
+  // Release the blocked calls and wait for every task to finish.
+  gate->unlock();
+  wg.wait();
+}
+
+// Same shape as the void-returning test, but each blocking call returns its
+// task index and the results are summed.
+TEST_P(WithBoundScheduler, BlockingCallIntReturn) {
+  constexpr int kTaskCount = 100;
+
+  // Locked up front so that every blocking call stalls until it is released.
+  auto gate = std::make_shared<std::mutex>();
+  gate->lock();
+
+  marl::WaitGroup wg(kTaskCount);
+  std::atomic<int> total = {0};
+  for (int index = 0; index < kTaskCount; ++index) {
+    marl::schedule([=, &total] {
+      defer(wg.done());
+      total += marl::blocking_call([=] {
+        gate->lock();
+        defer(gate->unlock());
+        return index;
+      });
+    });
+  }
+
+  gate->unlock();
+  wg.wait();
+
+  // 0 + 1 + ... + 99
+  ASSERT_EQ(total.load(), 4950);
+}
+
+// Checks that a task can be scheduled from inside a blocking call: the
+// innermost task is the only thing that calls wg.done(), so wg.wait() returns
+// only if that task actually runs.
+TEST_P(WithBoundScheduler, BlockingCallSchedulesTask) {
+  marl::WaitGroup wg(1);
+  marl::schedule([=] {
+    marl::blocking_call([=] { marl::schedule([=] { wg.done(); }); });
+  });
+  wg.wait();
+}
--- a/3party/marl/src/conditionvariable_test.cpp
+++ b/3party/marl/src/conditionvariable_test.cpp
@ -0,0 +1,142 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl/conditionvariable.h"
+#include "marl/waitgroup.h"
+
+#include "marl_test.h"
+
+#include <condition_variable>
+
+// Ping-pongs three times between the test body and a std::thread using a
+// single marl::ConditionVariable, checking that the lock is owned whenever
+// the predicate runs and whenever wait() returns.
+TEST_F(WithoutBoundScheduler, ConditionVariable) {
+  // trigger[i] is set by the test body to release the thread's i'th wait;
+  // signal[i] is set by the thread in response.
+  bool trigger[3] = {false, false, false};
+  bool signal[3] = {false, false, false};
+  marl::mutex mutex;
+  marl::ConditionVariable cv;
+
+  std::thread thread([&] {
+    for (int i = 0; i < 3; i++) {
+      marl::lock lock(mutex);
+      cv.wait(lock, [&] {
+        EXPECT_TRUE(lock.owns_lock());
+        return trigger[i];
+      });
+      EXPECT_TRUE(lock.owns_lock());
+      signal[i] = true;
+      cv.notify_one();
+    }
+  });
+
+  // Nothing has been triggered yet, so nothing may have been signalled.
+  ASSERT_FALSE(signal[0]);
+  ASSERT_FALSE(signal[1]);
+  ASSERT_FALSE(signal[2]);
+
+  for (int i = 0; i < 3; i++) {
+    {
+      marl::lock lock(mutex);
+      trigger[i] = true;
+      cv.notify_one();
+      // Wait for the thread to acknowledge this round.
+      cv.wait(lock, [&] {
+        EXPECT_TRUE(lock.owns_lock());
+        return signal[i];
+      });
+      EXPECT_TRUE(lock.owns_lock());
+    }
+
+    // Exactly the rounds up to and including i must have been signalled.
+    ASSERT_EQ(signal[0], 0 <= i);
+    ASSERT_EQ(signal[1], 1 <= i);
+    ASSERT_EQ(signal[2], 2 <= i);
+  }
+
+  thread.join();
+}
+
+// Same ping-pong as the WithoutBoundScheduler variant, run under the
+// WithBoundScheduler fixture: three rounds between the test body and a
+// std::thread over a single marl::ConditionVariable, checking lock ownership
+// in every predicate call and after every wait().
+TEST_P(WithBoundScheduler, ConditionVariable) {
+  // trigger[i] is set by the test body to release the thread's i'th wait;
+  // signal[i] is set by the thread in response.
+  bool trigger[3] = {false, false, false};
+  bool signal[3] = {false, false, false};
+  marl::mutex mutex;
+  marl::ConditionVariable cv;
+
+  std::thread thread([&] {
+    for (int i = 0; i < 3; i++) {
+      marl::lock lock(mutex);
+      cv.wait(lock, [&] {
+        EXPECT_TRUE(lock.owns_lock());
+        return trigger[i];
+      });
+      EXPECT_TRUE(lock.owns_lock());
+      signal[i] = true;
+      cv.notify_one();
+    }
+  });
+
+  // Nothing has been triggered yet, so nothing may have been signalled.
+  ASSERT_FALSE(signal[0]);
+  ASSERT_FALSE(signal[1]);
+  ASSERT_FALSE(signal[2]);
+
+  for (int i = 0; i < 3; i++) {
+    {
+      marl::lock lock(mutex);
+      trigger[i] = true;
+      cv.notify_one();
+      // Wait for the thread to acknowledge this round.
+      cv.wait(lock, [&] {
+        EXPECT_TRUE(lock.owns_lock());
+        return signal[i];
+      });
+      EXPECT_TRUE(lock.owns_lock());
+    }
+
+    // Exactly the rounds up to and including i must have been signalled.
+    ASSERT_EQ(signal[0], 0 <= i);
+    ASSERT_EQ(signal[1], 1 <= i);
+    ASSERT_EQ(signal[2], 2 <= i);
+  }
+
+  thread.join();
+}
+
+// ConditionVariableTimeouts spins up a whole lot of wait_for() calls,
+// unblocking some with timeouts and some with a notify, and then lets all the
+// workers go idle before repeating.
+// This tests that the scheduler handles timeouts correctly when they are
+// early-unblocked, along with the expected lock state.
+TEST_P(WithBoundScheduler, ConditionVariableTimeouts) {
+  for (int i = 0; i < 10; i++) {
+    marl::mutex mutex;
+    marl::ConditionVariable cv;
+    bool signaled = false;  // guarded by mutex
+    auto wg = marl::WaitGroup(100);
+    for (int j = 0; j < 100; j++) {
+      // Task j waits for at most j milliseconds for `signaled` to become
+      // true. mutex, cv and signaled are captured by reference, so they must
+      // outlive every task; the wg.wait() at the end of the iteration
+      // ensures that.
+      marl::schedule([=, &mutex, &cv, &signaled] {
+        {
+          marl::lock lock(mutex);
+          cv.wait_for(lock, std::chrono::milliseconds(j), [&] {
+            EXPECT_TRUE(lock.owns_lock());
+            return signaled;
+          });
+          EXPECT_TRUE(lock.owns_lock());
+        }
+        // Ensure the mutex unlock happens *before* the wg.done() call,
+        // otherwise the stack pointer may no longer be valid.
+        wg.done();
+      });
+    }
+    // Sleep for half of the largest timeout before notifying, with the aim
+    // that some waits time out while others are released by notify_all().
+    std::this_thread::sleep_for(std::chrono::milliseconds(50));
+    {
+      marl::lock lock(mutex);
+      signaled = true;
+      cv.notify_all();
+    }
+    wg.wait();
+  }
+}
--- a/3party/marl/src/containers_test.cpp
+++ b/3party/marl/src/containers_test.cpp
@ -0,0 +1,332 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl/containers.h"
+#include "marl_test.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+#include <cstddef>
+#include <string>
+
+// Fixture for the marl::containers::vector tests. The tests use the
+// `allocator` member inherited from WithoutBoundScheduler.
+class ContainersVectorTest : public WithoutBoundScheduler {};
+
+// A freshly constructed vector reports a size of zero.
+TEST_F(ContainersVectorTest, Empty) {
+  marl::containers::vector<std::string, 4> vec(allocator);
+  const size_t expectedSize = 0;
+  ASSERT_EQ(vec.size(), expectedSize);
+}
+
+// Writes and reads back four elements by index on a vector whose second
+// template argument is 4.
+TEST_F(ContainersVectorTest, WithinFixedCapIndex) {
+  const char* const letters[] = {"A", "B", "C", "D"};
+
+  marl::containers::vector<std::string, 4> vec(allocator);
+  vec.resize(4);
+  for (size_t i = 0; i < 4; i++) {
+    vec[i] = letters[i];
+  }
+
+  for (size_t i = 0; i < 4; i++) {
+    ASSERT_EQ(vec[i], letters[i]);
+  }
+}
+
+// Writes and reads back four elements by index on a vector whose second
+// template argument is only 1.
+TEST_F(ContainersVectorTest, BeyondFixedCapIndex) {
+  const char* const letters[] = {"A", "B", "C", "D"};
+
+  marl::containers::vector<std::string, 1> vec(allocator);
+  vec.resize(4);
+  for (size_t i = 0; i < 4; i++) {
+    vec[i] = letters[i];
+  }
+
+  for (size_t i = 0; i < 4; i++) {
+    ASSERT_EQ(vec[i], letters[i]);
+  }
+}
+
+// Pushes four elements, then pops them one at a time, checking size(),
+// the begin()/end() distance, front() and back() before every pop.
+TEST_F(ContainersVectorTest, WithinFixedCapPushPop) {
+  const char* const letters[] = {"A", "B", "C", "D"};
+
+  marl::containers::vector<std::string, 4> vec(allocator);
+  for (const char* letter : letters) {
+    vec.push_back(letter);
+  }
+
+  for (size_t remaining = 4; remaining > 0; --remaining) {
+    ASSERT_EQ(vec.size(), remaining);
+    ASSERT_EQ(vec.end() - vec.begin(), ptrdiff_t(remaining));
+
+    // The first element never changes; the last is whatever was pushed
+    // most recently among the elements still present.
+    ASSERT_EQ(vec.front(), "A");
+    ASSERT_EQ(vec.back(), letters[remaining - 1]);
+    vec.pop_back();
+  }
+  ASSERT_EQ(vec.size(), size_t(0));
+}
+
+// Same push/pop sequence as WithinFixedCapPushPop, on a vector whose second
+// template argument (2) is smaller than the number of elements pushed.
+TEST_F(ContainersVectorTest, BeyondFixedCapPushPop) {
+  const char* const letters[] = {"A", "B", "C", "D"};
+
+  marl::containers::vector<std::string, 2> vec(allocator);
+  for (const char* letter : letters) {
+    vec.push_back(letter);
+  }
+
+  for (size_t remaining = 4; remaining > 0; --remaining) {
+    ASSERT_EQ(vec.size(), remaining);
+    ASSERT_EQ(vec.end() - vec.begin(), ptrdiff_t(remaining));
+
+    // The first element never changes; the last is whatever was pushed
+    // most recently among the elements still present.
+    ASSERT_EQ(vec.front(), "A");
+    ASSERT_EQ(vec.back(), letters[remaining - 1]);
+    vec.pop_back();
+  }
+  ASSERT_EQ(vec.size(), size_t(0));
+}
+
+// Copy-constructs a vector from another with the same template arguments.
+TEST_F(ContainersVectorTest, CopyConstruct) {
+  const char* const letters[] = {"A", "B", "C"};
+
+  marl::containers::vector<std::string, 4> source(allocator);
+  source.resize(3);
+  for (size_t i = 0; i < 3; i++) {
+    source[i] = letters[i];
+  }
+
+  marl::containers::vector<std::string, 4> copy(source, allocator);
+  ASSERT_EQ(copy.size(), size_t(3));
+  for (size_t i = 0; i < 3; i++) {
+    ASSERT_EQ(copy[i], letters[i]);
+  }
+}
+
+// Copy-constructs a vector<std::string, 2> from a vector<std::string, 4>.
+TEST_F(ContainersVectorTest, CopyConstructDifferentBaseCapacity) {
+  const char* const letters[] = {"A", "B", "C"};
+
+  marl::containers::vector<std::string, 4> source(allocator);
+  source.resize(3);
+  for (size_t i = 0; i < 3; i++) {
+    source[i] = letters[i];
+  }
+
+  marl::containers::vector<std::string, 2> copy(source, allocator);
+  ASSERT_EQ(copy.size(), size_t(3));
+  for (size_t i = 0; i < 3; i++) {
+    ASSERT_EQ(copy[i], letters[i]);
+  }
+}
+
+// Copy-assigns into an empty vector with the same template arguments.
+TEST_F(ContainersVectorTest, CopyAssignment) {
+  const char* const letters[] = {"A", "B", "C"};
+
+  marl::containers::vector<std::string, 4> source(allocator);
+  source.resize(3);
+  for (size_t i = 0; i < 3; i++) {
+    source[i] = letters[i];
+  }
+
+  marl::containers::vector<std::string, 4> target(allocator);
+  target = source;
+  ASSERT_EQ(target.size(), size_t(3));
+  for (size_t i = 0; i < 3; i++) {
+    ASSERT_EQ(target[i], letters[i]);
+  }
+}
+
+// Copy-assigns a vector<std::string, 4> into an empty vector<std::string, 2>.
+TEST_F(ContainersVectorTest, CopyAssignmentDifferentBaseCapacity) {
+  const char* const letters[] = {"A", "B", "C"};
+
+  marl::containers::vector<std::string, 4> source(allocator);
+  source.resize(3);
+  for (size_t i = 0; i < 3; i++) {
+    source[i] = letters[i];
+  }
+
+  marl::containers::vector<std::string, 2> target(allocator);
+  target = source;
+  ASSERT_EQ(target.size(), size_t(3));
+  for (size_t i = 0; i < 3; i++) {
+    ASSERT_EQ(target[i], letters[i]);
+  }
+}
+
+// Move-constructs a vector<std::string, 2> from a vector<std::string, 4>.
+TEST_F(ContainersVectorTest, MoveConstruct) {
+  const char* const letters[] = {"A", "B", "C"};
+
+  marl::containers::vector<std::string, 4> source(allocator);
+  source.resize(3);
+  for (size_t i = 0; i < 3; i++) {
+    source[i] = letters[i];
+  }
+
+  marl::containers::vector<std::string, 2> moved(std::move(source),
+                                                 allocator);
+  ASSERT_EQ(moved.size(), size_t(3));
+  for (size_t i = 0; i < 3; i++) {
+    ASSERT_EQ(moved[i], letters[i]);
+  }
+}
+
+// Copy-assigns over a vector that already holds an element; the previous
+// contents must be replaced.
+TEST_F(ContainersVectorTest, Copy) {
+  const char* const letters[] = {"A", "B", "C"};
+
+  marl::containers::vector<std::string, 4> source(allocator);
+  marl::containers::vector<std::string, 2> target(allocator);
+
+  source.resize(3);
+  for (size_t i = 0; i < 3; i++) {
+    source[i] = letters[i];
+  }
+
+  target.resize(1);
+  target[0] = "Z";
+
+  target = source;
+  ASSERT_EQ(target.size(), size_t(3));
+  for (size_t i = 0; i < 3; i++) {
+    ASSERT_EQ(target[i], letters[i]);
+  }
+}
+
+// Move-assigns over a vector that already holds an element; the source must
+// be left empty and the previous contents of the target replaced.
+TEST_F(ContainersVectorTest, Move) {
+  const char* const letters[] = {"A", "B", "C"};
+
+  marl::containers::vector<std::string, 4> source(allocator);
+  marl::containers::vector<std::string, 2> target(allocator);
+
+  source.resize(3);
+  for (size_t i = 0; i < 3; i++) {
+    source[i] = letters[i];
+  }
+
+  target.resize(1);
+  target[0] = "Z";
+
+  target = std::move(source);
+  ASSERT_EQ(source.size(), size_t(0));
+  ASSERT_EQ(target.size(), size_t(3));
+  for (size_t i = 0; i < 3; i++) {
+    ASSERT_EQ(target[i], letters[i]);
+  }
+}
+
+// Fixture for the marl::containers::list tests. The tests use the
+// `allocator` member inherited from WithoutBoundScheduler.
+class ContainersListTest : public WithoutBoundScheduler {};
+
+// A freshly constructed list reports a size of zero.
+TEST_F(ContainersListTest, Empty) {
+  marl::containers::list<std::string> list(allocator);
+  ASSERT_EQ(list.size(), size_t(0));
+}
+
+// After a single emplace_front(), the returned iterator refers to the new
+// element, equals begin(), and is one step away from end().
+TEST_F(ContainersListTest, EmplaceOne) {
+  marl::containers::list<std::string> list(allocator);
+  auto itEntry = list.emplace_front("hello world");
+  ASSERT_EQ(*itEntry, "hello world");
+  ASSERT_EQ(list.size(), size_t(1));
+  auto it = list.begin();
+  ASSERT_EQ(it, itEntry);
+  ++it;
+  ASSERT_EQ(it, list.end());
+}
+
+// Each emplace_front() places the new element at the head, so after adding
+// a, b, c the iteration order is c, b, a. Iterators returned by earlier
+// emplacements must still refer to their elements.
+TEST_F(ContainersListTest, EmplaceThree) {
+  marl::containers::list<std::string> list(allocator);
+  auto itA = list.emplace_front("a");
+  auto itB = list.emplace_front("b");
+  auto itC = list.emplace_front("c");
+  ASSERT_EQ(*itA, "a");
+  ASSERT_EQ(*itB, "b");
+  ASSERT_EQ(*itC, "c");
+  ASSERT_EQ(list.size(), size_t(3));
+  auto it = list.begin();
+  ASSERT_EQ(it, itC);
+  ++it;
+  ASSERT_EQ(it, itB);
+  ++it;
+  ASSERT_EQ(it, itA);
+  ++it;
+  ASSERT_EQ(it, list.end());
+}
+
+// Erases the head element (c, the last one emplaced); b and a must remain,
+// in that order.
+TEST_F(ContainersListTest, EraseFront) {
+  marl::containers::list<std::string> list(allocator);
+  auto itA = list.emplace_front("a");
+  auto itB = list.emplace_front("b");
+  auto itC = list.emplace_front("c");
+  list.erase(itC);
+  ASSERT_EQ(list.size(), size_t(2));
+  auto it = list.begin();
+  ASSERT_EQ(it, itB);
+  ++it;
+  ASSERT_EQ(it, itA);
+  ++it;
+  ASSERT_EQ(it, list.end());
+}
+
+// Erases the tail element (a, the first one emplaced); c and b must remain,
+// in that order.
+TEST_F(ContainersListTest, EraseBack) {
+  marl::containers::list<std::string> list(allocator);
+  auto itA = list.emplace_front("a");
+  auto itB = list.emplace_front("b");
+  auto itC = list.emplace_front("c");
+  list.erase(itA);
+  ASSERT_EQ(list.size(), size_t(2));
+  auto it = list.begin();
+  ASSERT_EQ(it, itC);
+  ++it;
+  ASSERT_EQ(it, itB);
+  ++it;
+  ASSERT_EQ(it, list.end());
+}
+
+// Erases the middle element (b); c and a must remain, in that order.
+TEST_F(ContainersListTest, EraseMid) {
+  marl::containers::list<std::string> list(allocator);
+  auto itA = list.emplace_front("a");
+  auto itB = list.emplace_front("b");
+  auto itC = list.emplace_front("c");
+  list.erase(itB);
+  ASSERT_EQ(list.size(), size_t(2));
+  auto it = list.begin();
+  ASSERT_EQ(it, itC);
+  ++it;
+  ASSERT_EQ(it, itA);
+  ++it;
+  ASSERT_EQ(it, list.end());
+}
+
+// Emplaces 256 elements and checks that all of them are counted.
+TEST_F(ContainersListTest, Grow) {
+  marl::containers::list<std::string> list(allocator);
+  for (int i = 0; i < 256; i++) {
+    list.emplace_front(std::to_string(i));
+  }
+  ASSERT_EQ(list.size(), size_t(256));
+}
--- a/3party/marl/src/dag_test.cpp
+++ b/3party/marl/src/dag_test.cpp
@ -0,0 +1,190 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl/dag.h"
+
+#include "marl_test.h"
+
+using namespace testing;
+
+namespace {
+
+// Records the strings pushed by DAG work items, in the order they arrive.
+// push() is serialized by `mutex`; reads of `order` are not.
+struct Data {
+  std::mutex mutex;
+  std::vector<std::string> order;
+
+  // Appends s to `order` while holding `mutex`.
+  void push(std::string&& s) {
+    std::lock_guard<std::mutex> guard(mutex);
+    order.push_back(std::move(s));
+  }
+};
+
+// Returns a copy of the elements of `in` at indices [from, to).
+// No bounds checking is performed.
+template <typename T>
+std::vector<T> slice(const std::vector<T>& in, size_t from, size_t to) {
+  const auto first = in.begin() + from;
+  const auto last = in.begin() + to;
+  return std::vector<T>(first, last);
+}
+
+}  // namespace
+
+//  [A] --> [B] --> [C]                                                        |
+// Builds a three-node chain on a DAG that takes no run() arguments; the work
+// items reach `data` by reference capture.
+TEST_P(WithBoundScheduler, DAGChainNoArg) {
+  marl::DAG<>::Builder builder;
+
+  Data data;
+  auto nodeA = builder.root().then([&] { data.push("A"); });
+  auto nodeB = nodeA.then([&] { data.push("B"); });
+  nodeB.then([&] { data.push("C"); });
+
+  auto dag = builder.build();
+  dag->run();
+
+  ASSERT_THAT(data.order, ElementsAre("A", "B", "C"));
+}
+
+//  [A] --> [B] --> [C]                                                        |
+// Builds a three-node chain whose work items receive the Data passed to
+// run(), and checks that they execute in chain order.
+TEST_P(WithBoundScheduler, DAGChain) {
+  marl::DAG<Data&>::Builder builder;
+
+  auto nodeA = builder.root().then([](Data& data) { data.push("A"); });
+  auto nodeB = nodeA.then([](Data& data) { data.push("B"); });
+  nodeB.then([](Data& data) { data.push("C"); });
+
+  auto dag = builder.build();
+
+  Data data;
+  dag->run(data);
+
+  ASSERT_THAT(data.order, ElementsAre("A", "B", "C"));
+}
+
+//  [A] --> [B] --> [C]                                                        |
+// Runs the same built DAG three times against two Data instances, checking
+// that each run appends a full A, B, C sequence to the instance it was given.
+TEST_P(WithBoundScheduler, DAGRunRepeat) {
+  marl::DAG<Data&>::Builder builder;
+
+  auto nodeA = builder.root().then([](Data& data) { data.push("A"); });
+  auto nodeB = nodeA.then([](Data& data) { data.push("B"); });
+  nodeB.then([](Data& data) { data.push("C"); });
+
+  auto dag = builder.build();
+
+  Data first, second;
+  dag->run(first);
+  dag->run(second);
+  dag->run(first);
+
+  ASSERT_THAT(first.order, ElementsAre("A", "B", "C", "A", "B", "C"));
+  ASSERT_THAT(second.order, ElementsAre("A", "B", "C"));
+}
+
+//           /--> [A]                                                          |
+//  [root] --|--> [B]                                                          |
+//           \--> [C]                                                          |
+// Hangs three independent nodes directly off the root; all three must run,
+// in any order.
+TEST_P(WithBoundScheduler, DAGFanOutFromRoot) {
+  marl::DAG<Data&>::Builder builder;
+
+  auto root = builder.root();
+  for (const char* name : {"A", "B", "C"}) {
+    root.then([name](Data& data) { data.push(name); });
+  }
+
+  auto dag = builder.build();
+
+  Data data;
+  dag->run(data);
+
+  ASSERT_THAT(data.order, UnorderedElementsAre("A", "B", "C"));
+}
+
+//                /--> [A]                                                     |
+// [root] -->[N]--|--> [B]                                                     |
+//                \--> [C]                                                     |
+// Hangs three independent nodes off a single node N; N must run first and
+// the other three may follow in any order.
+TEST_P(WithBoundScheduler, DAGFanOutFromNonRoot) {
+  marl::DAG<Data&>::Builder builder;
+
+  auto root = builder.root();
+  auto node = root.then([](Data& data) { data.push("N"); });
+  for (const char* name : {"A", "B", "C"}) {
+    node.then([name](Data& data) { data.push(name); });
+  }
+
+  auto dag = builder.build();
+
+  Data data;
+  dag->run(data);
+
+  ASSERT_THAT(data.order, UnorderedElementsAre("N", "A", "B", "C"));
+  ASSERT_EQ(data.order[0], "N");
+  ASSERT_THAT(slice(data.order, 1, 4), UnorderedElementsAre("A", "B", "C"));
+}
+
+//          /--> [A0] --\        /--> [C0] --\        /--> [E0] --\            |
+// [root] --|--> [A1] --|-->[B]--|--> [C1] --|-->[D]--|--> [E1] --|-->[F]      |
+//                               \--> [C2] --/        |--> [E2] --|            |
+//                                                    \--> [E3] --/            |
+// Alternates fan-out and fan-in stages. Nodes within a fan-out stage may run
+// in any order, but each fan-in node (B, D, F) must run only after every
+// node of the preceding stage.
+TEST_P(WithBoundScheduler, DAGFanOutFanIn) {
+  marl::DAG<Data&>::Builder builder;
+
+  // Produces a work item that records `name` in the Data handed to run().
+  auto record = [](const char* name) {
+    return [name](Data& data) { data.push(name); };
+  };
+
+  auto root = builder.root();
+  auto a0 = root.then(record("A0"));
+  auto a1 = root.then(record("A1"));
+
+  auto b = builder.node(record("B"), {a0, a1});
+
+  auto c0 = b.then(record("C0"));
+  auto c1 = b.then(record("C1"));
+  auto c2 = b.then(record("C2"));
+
+  auto d = builder.node(record("D"), {c0, c1, c2});
+
+  auto e0 = d.then(record("E0"));
+  auto e1 = d.then(record("E1"));
+  auto e2 = d.then(record("E2"));
+  auto e3 = d.then(record("E3"));
+
+  builder.node(record("F"), {e0, e1, e2, e3});
+
+  auto dag = builder.build();
+
+  Data data;
+  dag->run(data);
+
+  // Every node ran exactly once...
+  ASSERT_THAT(data.order,
+              UnorderedElementsAre("A0", "A1", "B", "C0", "C1", "C2", "D", "E0",
+                                   "E1", "E2", "E3", "F"));
+  // ...and the stages are strictly ordered relative to one another.
+  ASSERT_THAT(slice(data.order, 0, 2), UnorderedElementsAre("A0", "A1"));
+  ASSERT_THAT(data.order[2], "B");
+  ASSERT_THAT(slice(data.order, 3, 6), UnorderedElementsAre("C0", "C1", "C2"));
+  ASSERT_THAT(data.order[6], "D");
+  ASSERT_THAT(slice(data.order, 7, 11),
+              UnorderedElementsAre("E0", "E1", "E2", "E3"));
+  ASSERT_THAT(data.order[11], "F");
+}
+
+// Passes the same std::function lvalue to then() and node() several times
+// and checks that it is still callable afterwards, i.e. that the builder did
+// not move out of it. The DAG is never built or run.
+TEST_P(WithBoundScheduler, DAGForwardFunc) {
+  marl::DAG<void>::Builder builder;
+  std::function<void()> func([](){});
+
+  ASSERT_TRUE(func);
+
+  auto a = builder.root()
+      .then(func)
+      .then(func);
+
+  builder.node(func, {a});
+
+  // A moved-from std::function would typically test false here.
+  ASSERT_TRUE(func);
+}
--- a/3party/marl/src/debug.cpp
+++ b/3party/marl/src/debug.cpp
@ -0,0 +1,48 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl/debug.h"
+#include "marl/scheduler.h"
+
+#include <cstdarg>
+#include <cstdlib>
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+namespace marl {
+
+// Writes the printf-style message to stderr and then aborts the process.
+// Does not return.
+void fatal(const char* msg, ...) {
+  va_list args;
+  va_start(args, msg);
+  vfprintf(stderr, msg, args);
+  va_end(args);
+
+  abort();
+}
+
+// Writes the printf-style message to stdout. Unlike fatal(), execution
+// continues afterwards.
+void warn(const char* msg, ...) {
+  va_list args;
+  va_start(args, msg);
+  vfprintf(stdout, msg, args);
+  va_end(args);
+}
+
+// Asserts that Scheduler::get() returns a non-null scheduler, naming
+// `feature` in the failure message.
+void assert_has_bound_scheduler(const char* feature) {
+  // Keeps `feature` referenced even if MARL_ASSERT expands to nothing
+  // (presumably in builds with assertions disabled - confirm in
+  // marl/debug.h).
+  (void)feature;  // unreferenced parameter
+  MARL_ASSERT(Scheduler::get() != nullptr,
+              "%s requires a marl::Scheduler to be bound", feature);
+}
+
+}  // namespace marl
--- a/3party/marl/src/defer_bench.cpp
+++ b/3party/marl/src/defer_bench.cpp
@ -0,0 +1,27 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl/defer.h"
+
+#include "benchmark/benchmark.h"
+
+// Sink for the benchmark below. It is volatile so that the increment
+// performed by the deferred statement cannot be optimized away.
+volatile int do_not_optimize_away_result = 0;
+
+// Benchmark: each iteration registers one defer() whose statement runs when
+// the loop body's scope ends.
+static void Defer(benchmark::State& state) {
+  for (auto _ : state) {
+    // Avoid benchmark::DoNotOptimize() as this is unfairly slower on Windows.
+    defer(do_not_optimize_away_result++);
+  }
+}
+BENCHMARK(Defer);
--- a/3party/marl/src/defer_test.cpp
+++ b/3party/marl/src/defer_test.cpp
@ -0,0 +1,36 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl/defer.h"
+
+#include "marl_test.h"
+
+// The deferred statement must have run by the time its scope has closed.
+TEST_F(WithoutBoundScheduler, Defer) {
+  bool ran = false;
+  {
+    defer(ran = true);
+  }
+  ASSERT_TRUE(ran);
+}
+
+// Deferred statements run in reverse order of declaration: the one declared
+// last observes the counter first.
+TEST_F(WithoutBoundScheduler, DeferOrder) {
+  int ticks = 0;
+  int first = 0;
+  int second = 0;
+  int third = 0;
+  {
+    defer(first = ++ticks);
+    defer(second = ++ticks);
+    defer(third = ++ticks);
+  }
+  ASSERT_EQ(first, 3);
+  ASSERT_EQ(second, 2);
+  ASSERT_EQ(third, 1);
+}
--- a/3party/marl/src/event_bench.cpp
+++ b/3party/marl/src/event_bench.cpp
@ -0,0 +1,74 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl_bench.h"
+
+#include "marl/containers.h"
+#include "marl/event.h"
+
+#include "benchmark/benchmark.h"
+
+// Benchmarks a chain of events: task i waits on events[i] and then signals
+// events[i + 1]. Signalling the first event starts the chain, and the
+// iteration ends once the last event has been signalled.
+BENCHMARK_DEFINE_F(Schedule, Event)(benchmark::State& state) {
+  run(state, [&](int numTasks) {
+    for (auto _ : state) {
+      marl::containers::vector<marl::Event, 1> events;
+      // One more event than tasks: each task links two neighbouring events.
+      events.resize(numTasks + 1);
+      for (auto i = 0; i < numTasks; i++) {
+        // Copied out so the task lambda captures the events by value.
+        marl::Event prev = events[i];
+        marl::Event next = events[i + 1];
+        marl::schedule([=] {
+          prev.wait();
+          next.signal();
+        });
+      }
+      events.front().signal();
+      events.back().wait();
+    }
+  });
+}
+BENCHMARK_REGISTER_F(Schedule, Event)->Apply(Schedule::args<512>);
+
+// EventBaton benchmarks alternating execution of two tasks.
+// Two tasks hand a "baton" back and forth numPasses times using a pair of
+// Auto-mode events; the second task signals `done` once its loop finishes.
+BENCHMARK_DEFINE_F(Schedule, EventBaton)(benchmark::State& state) {
+  run(state, [&](int numPasses) {
+    for (auto _ : state) {
+      marl::Event passToA(marl::Event::Mode::Auto);
+      marl::Event passToB(marl::Event::Mode::Auto);
+      marl::Event done(marl::Event::Mode::Auto);
+
+      // Task A: waits for the baton, then hands it to B.
+      marl::schedule(marl::Task(
+          [=] {
+            for (int i = 0; i < numPasses; i++) {
+              passToA.wait();
+              passToB.signal();
+            }
+          },
+          marl::Task::Flags::SameThread));
+
+      // Task B: waits for the baton, then hands it back to A. Signals
+      // completion after the final pass.
+      marl::schedule(marl::Task(
+          [=] {
+            for (int i = 0; i < numPasses; i++) {
+              passToB.wait();
+              passToA.signal();
+            }
+            done.signal();
+          },
+          marl::Task::Flags::SameThread));
+
+      // Hand the baton to A to start the exchange.
+      passToA.signal();
+      done.wait();
+    }
+  });
+}
+BENCHMARK_REGISTER_F(Schedule, EventBaton)->Apply(Schedule::args<262144>);
--- a/3party/marl/src/event_test.cpp
+++ b/3party/marl/src/event_test.cpp
@ -0,0 +1,226 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl/event.h"
+#include "marl/defer.h"
+#include "marl/waitgroup.h"
+
+#include "marl_test.h"
+
+#include <array>
+
+namespace std {
+namespace chrono {
+// Streams a duration as a whole number of microseconds followed by "us", so
+// that durations can be printed in test failure messages.
+// NOTE(review): C++20 adds a standard operator<< for durations; confirm this
+// overload is still wanted before raising the language level.
+template <typename Rep, typename Period>
+std::ostream& operator<<(std::ostream& os, const duration<Rep, Period>& d) {
+  // The value is converted to microseconds, so the unit suffix must be "us".
+  return os << chrono::duration_cast<chrono::microseconds>(d).count() << "us";
+}
+}  // namespace chrono
+}  // namespace std
+
+// Checks isSignalled() in both Manual and Auto mode: false on construction,
+// true after signal() (and still true when queried again), false after
+// clear().
+TEST_P(WithBoundScheduler, EventIsSignalled) {
+  for (auto mode : {marl::Event::Mode::Manual, marl::Event::Mode::Auto}) {
+    auto event = marl::Event(mode);
+    ASSERT_EQ(event.isSignalled(), false);
+    event.signal();
+    ASSERT_EQ(event.isSignalled(), true);
+    ASSERT_EQ(event.isSignalled(), true);
+    event.clear();
+    ASSERT_EQ(event.isSignalled(), false);
+  }
+}
+
+// Checks test() on an Auto event: after signal() the first test() is expected
+// to return true and the next one false.
+TEST_P(WithBoundScheduler, EventAutoTest) {
+  auto event = marl::Event(marl::Event::Mode::Auto);
+  ASSERT_EQ(event.test(), false);
+  event.signal();
+  ASSERT_EQ(event.test(), true);
+  ASSERT_EQ(event.test(), false);
+}
+
+// Checks test() on a Manual event: after signal(), repeated test() calls are
+// expected to keep returning true.
+TEST_P(WithBoundScheduler, EventManualTest) {
+  auto event = marl::Event(marl::Event::Mode::Manual);
+  ASSERT_EQ(event.test(), false);
+  event.signal();
+  ASSERT_EQ(event.test(), true);
+  ASSERT_EQ(event.test(), true);
+}
+
+// Schedules three tasks that all wait on one Auto event. Each signal() is
+// expected to release exactly one task, so the counter advances by one per
+// signal.
+TEST_P(WithBoundScheduler, EventAutoWait) {
+  std::atomic<int> counter = {0};
+  auto event = marl::Event(marl::Event::Mode::Auto);
+  auto done = marl::Event(marl::Event::Mode::Auto);
+
+  for (int i = 0; i < 3; i++) {
+    marl::schedule([=, &counter] {
+      event.wait();
+      counter++;
+      done.signal();
+    });
+  }
+
+  // No task is expected to get past event.wait() before the first signal.
+  ASSERT_EQ(counter.load(), 0);
+  event.signal();
+  done.wait();
+  ASSERT_EQ(counter.load(), 1);
+  event.signal();
+  done.wait();
+  ASSERT_EQ(counter.load(), 2);
+  event.signal();
+  done.wait();
+  ASSERT_EQ(counter.load(), 3);
+}
+
+// Schedules three tasks that all wait on one Manual event. A single signal()
+// is expected to release all of them; the WaitGroup is used to wait until
+// every task has finished.
+TEST_P(WithBoundScheduler, EventManualWait) {
+  std::atomic<int> counter = {0};
+  auto event = marl::Event(marl::Event::Mode::Manual);
+  auto wg = marl::WaitGroup(3);
+  for (int i = 0; i < 3; i++) {
+    marl::schedule([=, &counter] {
+      event.wait();
+      counter++;
+      wg.done();
+    });
+  }
+  event.signal();
+  wg.wait();
+  ASSERT_EQ(counter.load(), 3);
+}
+
+// Checks, for both event modes, that events can order task execution. The
+// tasks are scheduled in the order B, A, C but are chained so that A signals
+// B, B signals C and C signals done; the recorded sequence must be "ABC".
+TEST_P(WithBoundScheduler, EventSequence) {
+  for (auto mode : {marl::Event::Mode::Manual, marl::Event::Mode::Auto}) {
+    std::string sequence;
+    auto eventA = marl::Event(mode);
+    auto eventB = marl::Event(mode);
+    auto eventC = marl::Event(mode);
+    auto done = marl::Event(mode);
+    marl::schedule([=, &sequence] {
+      eventB.wait();
+      sequence += "B";
+      eventC.signal();
+    });
+    marl::schedule([=, &sequence] {
+      eventA.wait();
+      sequence += "A";
+      eventB.signal();
+    });
+    marl::schedule([=, &sequence] {
+      eventC.wait();
+      sequence += "C";
+      done.signal();
+    });
+    // Nothing may have been appended before eventA is signalled.
+    ASSERT_EQ(sequence, "");
+    eventA.signal();
+    done.wait();
+    ASSERT_EQ(sequence, "ABC");
+  }
+}
+
+// Schedules 1000 tasks that each wait_for() up to 10 seconds on a Manual
+// event, then signals the event and waits for every task to finish.
+// The test makes no assertion about the time taken.
+TEST_P(WithBoundScheduler, EventWaitForUnblocked) {
+  auto event = marl::Event(marl::Event::Mode::Manual);
+  auto wg = marl::WaitGroup(1000);
+  for (int i = 0; i < 1000; i++) {
+    marl::schedule([=] {
+      defer(wg.done());
+      auto duration = std::chrono::seconds(10);
+      event.wait_for(duration);
+    });
+  }
+  event.signal();  // unblock
+  wg.wait();
+}
+
+// Schedules 1000 tasks that each wait_for() 10 milliseconds on an event that
+// is never signalled. Every wait is expected to return false and to take at
+// least the requested duration.
+TEST_P(WithBoundScheduler, EventWaitForTimeTaken) {
+  auto event = marl::Event(marl::Event::Mode::Auto);
+  auto wg = marl::WaitGroup(1000);
+  for (int i = 0; i < 1000; i++) {
+    marl::schedule([=] {
+      defer(wg.done());
+      auto duration = std::chrono::milliseconds(10);
+      auto start = std::chrono::system_clock::now();
+      auto triggered = event.wait_for(duration);
+      auto end = std::chrono::system_clock::now();
+      ASSERT_FALSE(triggered);
+      ASSERT_GE(end - start, duration);
+    });
+  }
+  wg.wait();
+}
+
+// Schedules 1000 tasks that each wait_until() a deadline 10 seconds ahead on
+// a Manual event, then signals the event and waits for every task to finish.
+// The test makes no assertion about the time taken.
+TEST_P(WithBoundScheduler, EventWaitUntilUnblocked) {
+  auto event = marl::Event(marl::Event::Mode::Manual);
+  auto wg = marl::WaitGroup(1000);
+  for (int i = 0; i < 1000; i++) {
+    marl::schedule([=] {
+      defer(wg.done());
+      auto duration = std::chrono::seconds(10);
+      auto start = std::chrono::system_clock::now();
+      event.wait_until(start + duration);
+    });
+  }
+  event.signal();  // unblock
+  wg.wait();
+}
+
+// Schedules 1000 tasks that each wait_until() a deadline 10 milliseconds
+// ahead on an event that is never signalled. Every wait is expected to return
+// false and to take at least 10 milliseconds.
+TEST_P(WithBoundScheduler, EventWaitUntilTimeTaken) {
+  auto event = marl::Event(marl::Event::Mode::Auto);
+  auto wg = marl::WaitGroup(1000);
+  for (int i = 0; i < 1000; i++) {
+    marl::schedule([=] {
+      defer(wg.done());
+      auto duration = std::chrono::milliseconds(10);
+      auto start = std::chrono::system_clock::now();
+      auto triggered = event.wait_until(start + duration);
+      auto end = std::chrono::system_clock::now();
+      ASSERT_FALSE(triggered);
+      ASSERT_GE(end - start, duration);
+    });
+  }
+  wg.wait();
+}
+
+// EventWaitStressTest spins up a whole lot of wait_for() calls, unblocking
+// some with timeouts and some with an event signal, and then lets all the
+// workers go to idle before repeating.
+// This is testing to ensure that the scheduler handles timeouts correctly when
+// they are early-unblocked. Specifically, this is to test that fibers are
+// not double-placed into the idle or working lists.
+TEST_P(WithBoundScheduler, EventWaitStressTest) {
+  auto event = marl::Event(marl::Event::Mode::Manual);
+  for (int i = 0; i < 10; i++) {
+    auto wg = marl::WaitGroup(100);
+    for (int j = 0; j < 100; j++) {
+      marl::schedule([=] {
+        defer(wg.done());
+        // Timeouts range from 0ms to 99ms.
+        event.wait_for(std::chrono::milliseconds(j));
+      });
+    }
+    // Sleep for 50ms before signalling, so that the shorter timeouts are
+    // expected to have expired by the time the event is signalled.
+    std::this_thread::sleep_for(std::chrono::milliseconds(50));
+    event.signal();  // unblock
+    wg.wait();
+  }
+}
+
+// For each index i, builds a combined event with marl::Event::any() over
+// three Auto events, signals events[i] and expects the combined event to be
+// signalled.
+TEST_P(WithBoundScheduler, EventAny) {
+  for (int i = 0; i < 3; i++) {
+    std::array<marl::Event, 3> events = {
+        marl::Event(marl::Event::Mode::Auto),
+        marl::Event(marl::Event::Mode::Auto),
+        marl::Event(marl::Event::Mode::Auto),
+    };
+    auto any = marl::Event::any(events.begin(), events.end());
+    events[i].signal();
+    ASSERT_TRUE(any.isSignalled());
+  }
+}
--- a/3party/marl/src/marl_bench.cpp
+++ b/3party/marl/src/marl_bench.cpp
@ -0,0 +1,50 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl_bench.h"
+
+#include "marl/sanitizers.h"
+
+// Entry point of the benchmark executable. Prints one warning per sanitizer
+// that the build reports as enabled, then hands the command line over to the
+// benchmark library. Returns 1 if unrecognized arguments were reported,
+// otherwise runs the selected benchmarks and returns 0.
+int main(int argc, char** argv) {
+#if MARL_ADDRESS_SANITIZER_ENABLED
+  printf(
+      "***WARNING*** Marl built with address sanitizer enabled. "
+      "Timings will be affected\n");
+#endif
+#if MARL_MEMORY_SANITIZER_ENABLED
+  printf(
+      "***WARNING*** Marl built with memory sanitizer enabled. "
+      "Timings will be affected\n");
+#endif
+#if MARL_THREAD_SANITIZER_ENABLED
+  printf(
+      "***WARNING*** Marl built with thread sanitizer enabled. "
+      "Timings will be affected\n");
+#endif
+  ::benchmark::Initialize(&argc, argv);
+  const bool hasUnknownArgs =
+      ::benchmark::ReportUnrecognizedArguments(argc, argv);
+  if (!hasUnknownArgs) {
+    ::benchmark::RunSpecifiedBenchmarks();
+  }
+  return hasUnknownArgs ? 1 : 0;
+}
+
+// Runs 100000 rounds of shift/or/and mixing on x, masking every round with
+// the original input value, and returns the final value.
+uint32_t Schedule::doSomeWork(uint32_t x) {
+  const uint32_t mask = x;
+  uint32_t remaining = 100000;
+  while (remaining-- > 0) {
+    x |= x << 4;
+    x |= 0x1020;
+    x = (x >> 2) & mask;
+  }
+  return x;
+}
--- a/3party/marl/src/marl_bench.h
+++ b/3party/marl/src/marl_bench.h
@ -0,0 +1,96 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl/scheduler.h"
+#include "marl/thread.h"
+
+#include "benchmark/benchmark.h"
+
+// Define MARL_FULL_BENCHMARK to 1 if you want to run benchmarks for every
+// available logical CPU core.
+#ifndef MARL_FULL_BENCHMARK
+#define MARL_FULL_BENCHMARK 0
+#endif
+
+// Schedule is the benchmark fixture used by the benchmarks in this directory.
+// It provides helpers to run a function with a bound scheduler and to
+// register the (tasks, threads) argument pairs for a benchmark.
+class Schedule : public benchmark::Fixture {
+ public:
+  // SetUp() does nothing; the scheduler is created per run() call instead.
+  void SetUp(const ::benchmark::State&) {}
+
+  // TearDown() does nothing.
+  void TearDown(const ::benchmark::State&) {}
+
+  // run() creates a scheduler using the config cfg, sets the number of worker
+  // threads from the benchmark arguments, calls f, then unbinds and destructs
+  // the scheduler.
+  // F must be a function of the signature: void(int numTasks)
+  // NOTE(review): scheduler.unbind() is not reached if f throws - confirm
+  // that f is never expected to throw.
+  template <typename F>
+  void run(const ::benchmark::State& state,
+           marl::Scheduler::Config cfg,
+           F&& f) {
+    cfg.setWorkerThreadCount(numThreads(state));
+
+    marl::Scheduler scheduler(cfg);
+    scheduler.bind();
+    f(numTasks(state));
+    scheduler.unbind();
+  }
+
+  // run() creates a scheduler, sets the number of worker threads from the
+  // benchmark arguments, calls f, then unbinds and destructs the scheduler.
+  // F must be a function of the signature: void(int numTasks)
+  template <typename F>
+  void run(const ::benchmark::State& state, F&& f) {
+    run(state, marl::Scheduler::Config{}, f);
+  }
+
+  // args() sets up the benchmark to run a number of tasks over a number of
+  // threads.
+  // If MARL_FULL_BENCHMARK is enabled, then NumTasks tasks will be run
+  // across from 0 to numLogicalCPUs worker threads.
+  // If MARL_FULL_BENCHMARK is not enabled, then NumTasks tasks will be run
+  // across [0 .. numLogicalCPUs] worker threads in 2^n steps.
+  template <int NumTasks = 0x40000>
+  static void args(benchmark::internal::Benchmark* b) {
+    b->ArgNames({"tasks", "threads"});
+    // The zero-thread configuration is always registered.
+    b->Args({NumTasks, 0});
+    auto numLogicalCPUs = marl::Thread::numLogicalCPUs();
+#if MARL_FULL_BENCHMARK
+    for (unsigned int threads = 1U; threads <= numLogicalCPUs; threads++) {
+      b->Args({NumTasks, threads});
+    }
+#else
+    for (unsigned int threads = 1U; threads <= numLogicalCPUs; threads *= 2) {
+      b->Args({NumTasks, threads});
+    }
+    if ((numLogicalCPUs & (numLogicalCPUs - 1)) != 0) {
+      // numLogicalCPUs is not a power-of-two. Also test with numLogicalCPUs.
+      b->Args({NumTasks, numLogicalCPUs});
+    }
+#endif
+  }
+
+  // numThreads() returns the number of threads in the benchmark run from the
+  // state. This is the second benchmark argument ("threads").
+  static int numThreads(const ::benchmark::State& state) {
+    return static_cast<int>(state.range(1));
+  }
+
+  // numTasks() returns the number of tasks in the benchmark run from the
+  // state. This is the first benchmark argument ("tasks").
+  static int numTasks(const ::benchmark::State& state) {
+    return static_cast<int>(state.range(0));
+  }
+
+  // doSomeWork() performs some made up bit-shifty algorithm that's difficult
+  // for a compiler to optimize and produces consistent results.
+  static uint32_t doSomeWork(uint32_t x);
+};
--- a/3party/marl/src/marl_test.cpp
+++ b/3party/marl/src/marl_test.cpp
@ -0,0 +1,30 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl_test.h"
+
+// Instantiates every WithBoundScheduler test once for each of the worker
+// thread counts listed below.
+INSTANTIATE_TEST_SUITE_P(
+    SchedulerParams,
+    WithBoundScheduler,
+    testing::Values(SchedulerParams{0},  // Single-threaded mode test
+                    SchedulerParams{1},  // Single worker thread
+                    SchedulerParams{2},  // 2 worker threads...
+                    SchedulerParams{4},
+                    SchedulerParams{8},
+                    SchedulerParams{64}));
+
+// Entry point of the test executable: initializes googletest from the command
+// line and returns the result of running all registered tests.
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  const int exitCode = RUN_ALL_TESTS();
+  return exitCode;
+}
--- a/3party/marl/src/marl_test.h
+++ b/3party/marl/src/marl_test.h
@ -0,0 +1,78 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+#include "marl/scheduler.h"
+
+// SchedulerParams bundles the Scheduler construction parameters that the
+// parameterized tests vary.
+struct SchedulerParams {
+  int numWorkerThreads;
+
+  // Streams params as "SchedulerParams{numWorkerThreads: N}".
+  friend std::ostream& operator<<(std::ostream& os,
+                                  const SchedulerParams& params) {
+    os << "SchedulerParams{"
+       << "numWorkerThreads: ";
+    os << params.numWorkerThreads;
+    os << "}";
+    return os;
+  }
+};
+
+// WithoutBoundScheduler is a test fixture that does not bind a scheduler.
+// It provides a TrackedAllocator and checks in TearDown() that the
+// allocator's stats report no allocations.
+class WithoutBoundScheduler : public testing::Test {
+ public:
+  // Creates a TrackedAllocator wrapping the default allocator.
+  void SetUp() override {
+    allocator = new marl::TrackedAllocator(marl::Allocator::Default);
+  }
+
+  // Asserts that the allocator reports zero allocations and zero bytes
+  // allocated, then deletes it.
+  // NOTE(review): a failing ASSERT_* returns from TearDown() early, so the
+  // allocator is not deleted in that case.
+  void TearDown() override {
+    auto stats = allocator->stats();
+    ASSERT_EQ(stats.numAllocations(), 0U);
+    ASSERT_EQ(stats.bytesAllocated(), 0U);
+    delete allocator;
+  }
+
+  // Allocator for use by the tests; owned by the fixture.
+  marl::TrackedAllocator* allocator = nullptr;
+};
+
+// WithBoundScheduler is a parameterized test fixture that performs tests with
+// a bound scheduler using a number of different configurations.
+class WithBoundScheduler : public testing::TestWithParam<SchedulerParams> {
+ public:
+  // Creates a TrackedAllocator, then constructs and binds a scheduler that
+  // uses it, with the worker thread count taken from the test parameter.
+  void SetUp() override {
+    allocator = new marl::TrackedAllocator(marl::Allocator::Default);
+
+    auto& params = GetParam();
+
+    marl::Scheduler::Config cfg;
+    cfg.setAllocator(allocator);
+    cfg.setWorkerThreadCount(params.numWorkerThreads);
+    cfg.setFiberStackSize(0x10000);
+
+    // The scheduler is not stored in the fixture; TearDown() retrieves it
+    // again with marl::Scheduler::get().
+    auto scheduler = new marl::Scheduler(cfg);
+    scheduler->bind();
+  }
+
+  // Unbinds and deletes the scheduler created in SetUp(), then asserts that
+  // the allocator reports zero allocations and zero bytes allocated.
+  // NOTE(review): a failing ASSERT_* returns from TearDown() early, so the
+  // allocator is not deleted in that case.
+  void TearDown() override {
+    auto scheduler = marl::Scheduler::get();
+    scheduler->unbind();
+    delete scheduler;
+
+    auto stats = allocator->stats();
+    ASSERT_EQ(stats.numAllocations(), 0U);
+    ASSERT_EQ(stats.bytesAllocated(), 0U);
+    delete allocator;
+  }
+
+  // Allocator handed to the scheduler; owned by the fixture.
+  marl::TrackedAllocator* allocator = nullptr;
+};
--- a/3party/marl/src/memory.cpp
+++ b/3party/marl/src/memory.cpp
@ -0,0 +1,249 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl/memory.h"
+
+#include "marl/debug.h"
+#include "marl/sanitizers.h"
+
+#include <cstring>
+
+#if defined(__linux__) || defined(__FreeBSD__) || defined(__APPLE__) || defined(__EMSCRIPTEN__)
+#include <sys/mman.h>
+#include <unistd.h>
+namespace {
+// The page size is cached in a namespace-scope constant rather than in a
+// static local of pageSize() because of the following TSAN false-positive
+// bug:
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68338
+const size_t kPageSize = sysconf(_SC_PAGESIZE);
+
+// Returns the cached page size, in bytes.
+inline size_t pageSize() {
+  return kPageSize;
+}
+
+// Maps count pages of private, anonymous, read-write memory.
+// Returns nullptr if the mapping failed.
+inline void* allocatePages(size_t count) {
+  const size_t numBytes = count * pageSize();
+  void* const pages = mmap(nullptr, numBytes, PROT_READ | PROT_WRITE,
+                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  MARL_ASSERT(pages != MAP_FAILED, "Failed to allocate %d pages", int(count));
+  return (pages == MAP_FAILED) ? nullptr : pages;
+}
+
+// Unmaps count pages starting at ptr.
+inline void freePages(void* ptr, size_t count) {
+  const int status = munmap(ptr, count * pageSize());
+  (void)status;
+  MARL_ASSERT(status == 0, "Failed to free %d pages at %p", int(count), ptr);
+}
+
+// Removes all access permissions from the single page at addr.
+inline void protectPage(void* addr) {
+  const int status = mprotect(addr, pageSize(), PROT_NONE);
+  (void)status;
+  MARL_ASSERT(status == 0, "Failed to protect page at %p", addr);
+}
+}  // anonymous namespace
+#elif defined(__Fuchsia__)
+#include <unistd.h>
+#include <zircon/process.h>
+#include <zircon/syscalls.h>
+namespace {
+// This was a static in pageSize(), but due to the following TSAN false-positive
+// bug, this has been moved out to a global.
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68338
+const size_t kPageSize = sysconf(_SC_PAGESIZE);
+
+// Returns the cached page size, in bytes.
+inline size_t pageSize() {
+  return kPageSize;
+}
+
+// Creates a VMO of count pages and maps it read-write into the root VMAR.
+// Returns nullptr if either the VMO creation or the mapping fails.
+inline void* allocatePages(size_t count) {
+  auto length = count * kPageSize;
+  zx_handle_t vmo;
+  if (zx_vmo_create(length, 0, &vmo) != ZX_OK) {
+    return nullptr;
+  }
+  zx_vaddr_t reservation = 0;
+  zx_status_t status =
+      zx_vmar_map(zx_vmar_root_self(), ZX_VM_PERM_READ | ZX_VM_PERM_WRITE, 0,
+                  vmo, 0, length, &reservation);
+  // The VMO handle is closed whether or not the mapping succeeded.
+  zx_handle_close(vmo);
+  MARL_ASSERT(status == ZX_OK, "Failed to allocate %d pages", int(count));
+  if (status != ZX_OK) {
+    // Report the failure as nullptr instead of returning whatever value
+    // reservation happens to hold.
+    return nullptr;
+  }
+  return reinterpret_cast<void*>(reservation);
+}
+
+// Unmaps count pages starting at ptr from the root VMAR.
+inline void freePages(void* ptr, size_t count) {
+  auto length = count * kPageSize;
+  zx_status_t status = zx_vmar_unmap(zx_vmar_root_self(),
+                                     reinterpret_cast<zx_vaddr_t>(ptr), length);
+  (void)status;
+  MARL_ASSERT(status == ZX_OK, "Failed to free %d pages at %p", int(count),
+              ptr);
+}
+
+// Re-protects the single page at addr, passing 0 (no permission flags) as the
+// protection options.
+inline void protectPage(void* addr) {
+  zx_status_t status = zx_vmar_protect(
+      zx_vmar_root_self(), 0, reinterpret_cast<zx_vaddr_t>(addr), kPageSize);
+  (void)status;
+  MARL_ASSERT(status == ZX_OK, "Failed to protect page at %p", addr);
+}
+}  // anonymous namespace
+#elif defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN 1
+#include <Windows.h>
+namespace {
+// Returns the system page size, in bytes. The value is queried once with
+// GetSystemInfo() and cached in a function-local static.
+inline size_t pageSize() {
+  static const auto cached = [] {
+    SYSTEM_INFO info = {};
+    GetSystemInfo(&info);
+    return info.dwPageSize;
+  }();
+  return cached;
+}
+
+// Reserves and commits count pages of read-write memory.
+// Returns nullptr if VirtualAlloc() fails.
+inline void* allocatePages(size_t count) {
+  const size_t numBytes = count * pageSize();
+  auto pages =
+      VirtualAlloc(nullptr, numBytes, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
+  MARL_ASSERT(pages != nullptr, "Failed to allocate %d pages", int(count));
+  return pages;
+}
+
+// Releases the allocation that starts at ptr. count is only used in the
+// assertion message.
+inline void freePages(void* ptr, size_t count) {
+  (void)count;
+  auto ok = VirtualFree(ptr, 0, MEM_RELEASE);
+  (void)ok;
+  MARL_ASSERT(ok != 0, "Failed to free %d pages at %p", int(count), ptr);
+}
+
+// Marks the single page at addr as PAGE_NOACCESS.
+inline void protectPage(void* addr) {
+  DWORD previous = 0;
+  auto ok = VirtualProtect(addr, pageSize(), PAGE_NOACCESS, &previous);
+  (void)ok;
+  MARL_ASSERT(ok != 0, "Failed to protect page at %p", addr);
+}
+}  // anonymous namespace
+#else
+#error "Page based allocation not implemented for this platform"
+#endif
+
+namespace {
+
+// pagedMalloc() allocates size bytes of uninitialized storage with the
+// specified minimum byte alignment using OS specific page mapping calls.
+// If guardLow is true then reads or writes to the page below the returned
+// address will cause a page fault.
+// If guardHigh is true then reads or writes to the page above the allocated
+// block will cause a page fault.
+// Returns nullptr if the pages could not be allocated.
+// The pointer returned must be freed with pagedFree().
+void* pagedMalloc(size_t alignment,
+                  size_t size,
+                  bool guardLow,
+                  bool guardHigh) {
+  (void)alignment;
+  MARL_ASSERT(alignment < pageSize(),
+              "alignment (0x%x) must be less than the page size (0x%x)",
+              int(alignment), int(pageSize()));
+  // Round the request up to whole pages and add one page per guard.
+  auto numRequestedPages = (size + pageSize() - 1) / pageSize();
+  auto numTotalPages =
+      numRequestedPages + (guardLow ? 1 : 0) + (guardHigh ? 1 : 0);
+  auto mem = reinterpret_cast<uint8_t*>(allocatePages(numTotalPages));
+  if (mem == nullptr) {
+    // Do not protect or offset a failed mapping: with guardLow set that would
+    // turn the failure into a bogus non-null pointer.
+    return nullptr;
+  }
+  if (guardLow) {
+    protectPage(mem);
+    mem += pageSize();
+  }
+  if (guardHigh) {
+    protectPage(mem + numRequestedPages * pageSize());
+  }
+  return mem;
+}
+
+// pagedFree() frees the memory allocated with pagedMalloc().
+// alignment, size, guardLow and guardHigh must match the values passed to
+// pagedMalloc(), as they are used to recompute the mapped range.
+// Passing a null ptr is a no-op.
+void pagedFree(void* ptr,
+               size_t alignment,
+               size_t size,
+               bool guardLow,
+               bool guardHigh) {
+  (void)alignment;
+  MARL_ASSERT(alignment < pageSize(),
+              "alignment (0x%x) must be less than the page size (0x%x)",
+              int(alignment), int(pageSize()));
+  if (ptr == nullptr) {
+    // Nothing was mapped; avoid offsetting and unmapping a null pointer.
+    return;
+  }
+  auto numRequestedPages = (size + pageSize() - 1) / pageSize();
+  auto numTotalPages =
+      numRequestedPages + (guardLow ? 1 : 0) + (guardHigh ? 1 : 0);
+  if (guardLow) {
+    // The mapping starts one page below the pointer handed to the caller.
+    ptr = reinterpret_cast<uint8_t*>(ptr) - pageSize();
+  }
+  freePages(ptr, numTotalPages);
+}
+
+// alignedMalloc() allocates size bytes of uninitialized storage with the
+// specified minimum byte alignment. The pointer returned must be freed with
+// alignedFree().
+// The underlying malloc() block is over-allocated by alignment bytes plus one
+// pointer; the address returned by malloc() is stored directly after the size
+// bytes handed to the caller.
+// Returns nullptr if malloc() fails.
+inline void* alignedMalloc(size_t alignment, size_t size) {
+  size_t allocSize = size + alignment + sizeof(void*);
+  auto allocation = malloc(allocSize);
+  if (allocation == nullptr) {
+    // Do not write the bookkeeping pointer through a failed allocation.
+    return nullptr;
+  }
+  auto aligned = reinterpret_cast<uint8_t*>(marl::alignUp(
+      reinterpret_cast<uintptr_t>(allocation), alignment));  // align
+  memcpy(aligned + size, &allocation, sizeof(void*));  // pointer-to-allocation
+  return aligned;
+}
+
+// alignedFree() frees memory allocated by alignedMalloc.
+// size must be the size that was passed to alignedMalloc(), as the pointer to
+// the underlying allocation is read from directly after those size bytes.
+// Passing a null ptr is a no-op, as with free().
+inline void alignedFree(void* ptr, size_t size) {
+  if (ptr == nullptr) {
+    return;
+  }
+  void* base;
+  memcpy(&base, reinterpret_cast<uint8_t*>(ptr) + size, sizeof(void*));
+  free(base);
+}
+
+// DefaultAllocator is the marl::Allocator implementation behind
+// marl::Allocator::Default. A request is served by one of three strategies:
+// guarded page allocation, aligned malloc, or plain malloc.
+class DefaultAllocator : public marl::Allocator {
+ public:
+  static DefaultAllocator instance;
+
+  // Allocates memory for request and returns it along with a copy of the
+  // request.
+  marl::Allocation allocate(const marl::Allocation::Request& request) override {
+    marl::Allocation result;
+    result.request = request;
+
+    if (request.useGuards) {
+      // Guard pages are always placed on both sides of the block.
+      result.ptr = ::pagedMalloc(request.alignment, request.size, true, true);
+    } else if (request.alignment > 1U) {
+      result.ptr = ::alignedMalloc(request.alignment, request.size);
+    } else {
+      result.ptr = ::malloc(request.size);
+    }
+
+    MARL_ASSERT(result.ptr != nullptr, "Allocation failed");
+    MARL_ASSERT(
+        reinterpret_cast<uintptr_t>(result.ptr) % request.alignment == 0,
+        "Allocation gave incorrect alignment");
+
+    return result;
+  }
+
+  // Releases allocation using the strategy that its request selects, which
+  // mirrors the choice made in allocate().
+  void free(const marl::Allocation& allocation) override {
+    const auto& req = allocation.request;
+    if (req.useGuards) {
+      ::pagedFree(allocation.ptr, req.alignment, req.size, true, true);
+    } else if (req.alignment > 1U) {
+      ::alignedFree(allocation.ptr, req.size);
+    } else {
+      ::free(allocation.ptr);
+    }
+  }
+};
+
+DefaultAllocator DefaultAllocator::instance;
+
+}  // anonymous namespace
+
+namespace marl {
+
+// The default allocator points at the file-local DefaultAllocator instance.
+Allocator* Allocator::Default = &DefaultAllocator::instance;
+
+// Returns the page size reported by the platform-specific pageSize() helper
+// defined earlier in this file.
+size_t pageSize() {
+  return ::pageSize();
+}
+
+}  // namespace marl
--- a/3party/marl/src/memory_test.cpp
+++ b/3party/marl/src/memory_test.cpp
@ -0,0 +1,89 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl/memory.h"
+
+#include "marl_test.h"
+
+// AllocatorTest is the fixture for the allocator tests; it exposes the
+// default allocator as the allocator under test.
+class AllocatorTest : public testing::Test {
+ public:
+  marl::Allocator* allocator = marl::Allocator::Default;
+};
+
+// Allocates and frees blocks for every combination of guard usage, alignment
+// and size listed below. For each allocation it checks that the request is
+// echoed back unchanged, that the pointer honours the alignment, and that the
+// whole block can be written.
+TEST_F(AllocatorTest, AlignedAllocate) {
+  for (auto useGuards : {false, true}) {
+    for (auto alignment : {1, 2, 4, 8, 16, 32, 64, 128}) {
+      for (auto size : {1,   2,   3,   4,   5,   7,   8,   14,  16,  17,
+                        31,  34,  50,  63,  64,  65,  100, 127, 128, 129,
+                        200, 255, 256, 257, 500, 511, 512, 513}) {
+        marl::Allocation::Request request;
+        request.alignment = alignment;
+        request.size = size;
+        request.useGuards = useGuards;
+
+        auto allocation = allocator->allocate(request);
+        auto ptr = allocation.ptr;
+        ASSERT_EQ(allocation.request.size, request.size);
+        ASSERT_EQ(allocation.request.alignment, request.alignment);
+        ASSERT_EQ(allocation.request.useGuards, request.useGuards);
+        ASSERT_EQ(allocation.request.usage, request.usage);
+        // All alignments tested are powers of two, so a mask check suffices.
+        ASSERT_EQ(reinterpret_cast<uintptr_t>(ptr) & (alignment - 1), 0U);
+        memset(ptr, 0,
+               size);  // Check the memory was actually allocated.
+        allocator->free(allocation);
+      }
+    }
+  }
+}
+
+// Structs with 16, 32 and 64 byte alignment requirements, used by the Create
+// test below. Each is padded so that its members fill the aligned size.
+struct alignas(16) StructWith16ByteAlignment {
+  uint8_t i;
+  uint8_t padding[15];
+};
+struct alignas(32) StructWith32ByteAlignment {
+  uint8_t i;
+  uint8_t padding[31];
+};
+struct alignas(64) StructWith64ByteAlignment {
+  uint8_t i;
+  uint8_t padding[63];
+};
+
+// Checks that Allocator::create<T>() returns pointers that honour the
+// alignment of over-aligned types, then destroys the objects in reverse order
+// of creation.
+TEST_F(AllocatorTest, Create) {
+  auto s16 = allocator->create<StructWith16ByteAlignment>();
+  auto s32 = allocator->create<StructWith32ByteAlignment>();
+  auto s64 = allocator->create<StructWith64ByteAlignment>();
+  ASSERT_EQ(alignof(StructWith16ByteAlignment), 16U);
+  ASSERT_EQ(alignof(StructWith32ByteAlignment), 32U);
+  ASSERT_EQ(alignof(StructWith64ByteAlignment), 64U);
+  ASSERT_EQ(reinterpret_cast<uintptr_t>(s16) & 15U, 0U);
+  ASSERT_EQ(reinterpret_cast<uintptr_t>(s32) & 31U, 0U);
+  ASSERT_EQ(reinterpret_cast<uintptr_t>(s64) & 63U, 0U);
+  allocator->destroy(s64);
+  allocator->destroy(s32);
+  allocator->destroy(s16);
+}
+
+#if GTEST_HAS_DEATH_TEST
+// Checks that writing one byte below, or one page past, the start of a
+// guarded allocation kills the process.
+TEST_F(AllocatorTest, Guards) {
+  marl::Allocation::Request request;
+  request.alignment = 16;
+  request.size = 16;
+  request.useGuards = true;
+  auto alloc = allocator->allocate(request);
+  auto ptr = reinterpret_cast<uint8_t*>(alloc.ptr);
+  EXPECT_DEATH(ptr[-1] = 1, "");
+  EXPECT_DEATH(ptr[marl::pageSize()] = 1, "");
+  // The death statements run in a child process, so the allocation is still
+  // live in this process and has to be released.
+  allocator->free(alloc);
+}
+#endif
--- a/3party/marl/src/non_marl_bench.cpp
+++ b/3party/marl/src/non_marl_bench.cpp
@ -0,0 +1,167 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file contains a number of benchmarks that do not use marl.
+// They exist to compare marl's performance against other simple scheduler
+// approaches.
+
+#include "marl_bench.h"
+
+#include "benchmark/benchmark.h"
+
+#include <mutex>
+#include <queue>
+#include <thread>
+
+namespace {
+
+// Event is a basic wait-and-signal synchronization primitive built on a mutex
+// and a condition variable. Once signalled, it stays signalled.
+class Event {
+ public:
+  // wait blocks the caller until signal() has been called.
+  void wait() {
+    std::unique_lock<std::mutex> guard(mutex_);
+    while (!signalled_) {
+      cv_.wait(guard);
+    }
+  }
+
+  // signal marks the Event as signalled and wakes every blocked wait() call.
+  void signal() {
+    std::lock_guard<std::mutex> guard(mutex_);
+    signalled_ = true;
+    cv_.notify_all();
+  }
+
+ private:
+  std::condition_variable cv_;
+  std::mutex mutex_;
+  bool signalled_ = false;
+};
+
+}  // anonymous namespace
+
+// A simple multi-thread, single-queue task executor that shares a single mutex
+// across N threads. This implementation suffers from lock contention.
+// The task and thread counts are read from the benchmark state with
+// Schedule::numTasks() and Schedule::numThreads().
+static void SingleQueueTaskExecutor(benchmark::State& state) {
+  using Task = std::function<uint32_t(uint32_t)>;
+
+  auto const numTasks = Schedule::numTasks(state);
+  auto const numThreads = Schedule::numThreads(state);
+
+  for (auto _ : state) {
+    state.PauseTiming();
+
+    std::mutex mutex;
+    // Set everything up with the mutex locked to prevent the threads from
+    // performing work while the timing is paused.
+    mutex.lock();
+
+    // Set up the tasks.
+    std::queue<Task> tasks;
+    for (int i = 0; i < numTasks; i++) {
+      tasks.push(Schedule::doSomeWork);
+    }
+
+    // taskRunner pops and runs tasks until the queue is empty. An empty Task
+    // (nothing was popped) is the signal to stop.
+    auto taskRunner = [&] {
+      while (true) {
+        Task task;
+
+        // Take the next task.
+        // Note that this lock is likely to block while waiting for other
+        // threads.
+        mutex.lock();
+        if (tasks.size() > 0) {
+          task = tasks.front();
+          tasks.pop();
+        }
+        mutex.unlock();
+
+        if (task) {
+          task(123);
+        } else {
+          return;  // done.
+        }
+      }
+    };
+
+    // Set up the threads.
+    std::vector<std::thread> threads;
+    for (int i = 0; i < numThreads; i++) {
+      threads.emplace_back(std::thread(taskRunner));
+    }
+
+    state.ResumeTiming();
+    mutex.unlock();  // Go threads, go!
+
+    if (numThreads > 0) {
+      // Wait for all threads to finish.
+      for (auto& thread : threads) {
+        thread.join();
+      }
+    } else {
+      // Single-threaded test - just run the worker.
+      taskRunner();
+    }
+  }
+}
+BENCHMARK(SingleQueueTaskExecutor)->Apply(Schedule::args);
+
+// A simple multi-thread, multi-queue task executor that avoids lock contention.
+// Tasks queues are evenly balanced, and each should take an equal amount of
+// time to execute.
+// The task and thread counts are read from the benchmark state with
+// Schedule::numTasks() and Schedule::numThreads().
+static void MultiQueueTaskExecutor(benchmark::State& state) {
+  using Task = std::function<uint32_t(uint32_t)>;
+  using TaskQueue = std::vector<Task>;
+
+  auto const numTasks = Schedule::numTasks(state);
+  auto const numThreads = Schedule::numThreads(state);
+  // At least one queue is needed, even when there are no worker threads.
+  auto const numQueues = std::max(numThreads, 1);
+
+  // Set up the tasks queues.
+  // The queues are built once, outside the timed loop, and tasks are dealt
+  // out to them round-robin.
+  std::vector<TaskQueue> taskQueues(numQueues);
+  for (int i = 0; i < numTasks; i++) {
+    taskQueues[i % numQueues].emplace_back(Schedule::doSomeWork);
+  }
+
+  for (auto _ : state) {
+    if (numThreads > 0) {
+      state.PauseTiming();
+      // start holds the threads back until timing has been resumed.
+      Event start;
+
+      // Set up the threads.
+      std::vector<std::thread> threads;
+      for (int i = 0; i < numThreads; i++) {
+        // Thread i runs only the tasks in taskQueues[i].
+        threads.emplace_back(std::thread([&, i] {
+          start.wait();
+          for (auto& task : taskQueues[i]) {
+            task(123);
+          }
+        }));
+      }
+
+      state.ResumeTiming();
+      start.signal();
+
+      // Wait for all threads to finish.
+      for (auto& thread : threads) {
+        thread.join();
+      }
+    } else {
+      // Single-threaded test - just run the tasks.
+      for (auto& task : taskQueues[0]) {
+        task(123);
+      }
+    }
+  }
+}
+BENCHMARK(MultiQueueTaskExecutor)->Apply(Schedule::args);
--- a/3party/marl/src/osfiber.h
+++ b/3party/marl/src/osfiber.h
@ -0,0 +1,35 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This header (via the backend it includes) defines non-template types, so it
+// must not be expanded twice in one translation unit.
+#pragma once
+
+#include "marl/sanitizers.h"
+
+// Fiber stack guards default to on for debug builds that are not built with
+// AddressSanitizer. Define MARL_USE_FIBER_STACK_GUARDS before including this
+// header to override the default.
+#ifndef MARL_USE_FIBER_STACK_GUARDS
+#if !defined(NDEBUG) && !MARL_ADDRESS_SANITIZER_ENABLED
+#define MARL_USE_FIBER_STACK_GUARDS 1
+#else
+#define MARL_USE_FIBER_STACK_GUARDS 0
+#endif
+#endif  // MARL_USE_FIBER_STACK_GUARDS
+
+// The default above never enables guards under ASan, so this only fires when
+// the guards were explicitly forced on in an ASan build.
+#if MARL_USE_FIBER_STACK_GUARDS && MARL_ADDRESS_SANITIZER_ENABLED
+#warning "ASAN can raise spurious failures when using mmap() allocated stacks"
+#endif
+
+// Select the OSFiber backend: Windows fibers, ucontext when explicitly
+// requested, otherwise the hand-written assembly implementation.
+#if defined(_WIN32)
+#include "osfiber_windows.h"
+#elif defined(MARL_FIBERS_USE_UCONTEXT)
+#include "osfiber_ucontext.h"
+#else
+#include "osfiber_asm.h"
+#endif
--- a/3party/marl/src/osfiber_aarch64.c
+++ b/3party/marl/src/osfiber_aarch64.c
@ -0,0 +1,58 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if defined(__aarch64__)
+
+#include <stddef.h>
+
+#include "osfiber_asm_aarch64.h"
+
+#include "marl/export.h"
+
+// First function executed by a new fiber: marl_fiber_set_target() stores this
+// function's address in the context's LR slot, and `target` / `arg` in the
+// r0 / r1 slots.
+MARL_EXPORT
+void marl_fiber_trampoline(void (*target)(void*), void* arg) {
+  (*target)(arg);
+}
+
+// __attribute__((weak)) doesn't work on MacOS.
+#if defined(linux) || defined(__linux) || defined(__linux__)
+// This is needed for HWASan runtimes that don't have this commit:
+// https://reviews.llvm.org/D149228.
+// Both hooks are declared weak so that their addresses can be tested before
+// use; marl_fiber_set_target() only calls them when both are non-null.
+__attribute__((weak)) void __hwasan_tag_memory(const volatile void* p,
+                                               unsigned char tag,
+                                               size_t size);
+__attribute__((weak)) void* __hwasan_tag_pointer(const volatile void* p,
+                                                 unsigned char tag);
+#endif
+
+// Initialises `ctx` so that switching to it enters
+// marl_fiber_trampoline(target, arg) with SP at the top of `stack`.
+MARL_EXPORT
+void marl_fiber_set_target(struct marl_fiber_context* ctx,
+                           void* stack,
+                           uint32_t stack_size,
+                           void (*target)(void*),
+                           void* arg) {
+#if defined(linux) || defined(__linux) || defined(__linux__)
+  // The HWASan hooks are weak symbols: use them only when both resolved.
+  if (__hwasan_tag_memory && __hwasan_tag_pointer) {
+    // Retag the stack pointer and the whole stack region with tag 0.
+    stack = __hwasan_tag_pointer(stack, 0);
+    __hwasan_tag_memory(stack, 0, stack_size);
+  }
+#endif
+
+  // One-past-the-end address of the stack allocation.
+  const uintptr_t stack_end = (uintptr_t)((uint8_t*)stack + stack_size);
+
+  ctx->r0 = (uintptr_t)target;
+  ctx->r1 = (uintptr_t)arg;
+  ctx->LR = (uintptr_t)&marl_fiber_trampoline;
+  // Round the initial stack pointer down to a 16-byte boundary.
+  ctx->SP = stack_end & ~(uintptr_t)0xF;
+}
+
+#endif  // defined(__aarch64__)
--- a/3party/marl/src/osfiber_arm.c
+++ b/3party/marl/src/osfiber_arm.c
@ -0,0 +1,39 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if defined(__arm__)
+
+#include "osfiber_asm_arm.h"
+
+#include "marl/export.h"
+
+// First function executed by a new fiber: marl_fiber_set_target() stores this
+// function's address in the context's LR slot, and `target` / `arg` in the
+// r0 / r1 slots.
+MARL_EXPORT
+void marl_fiber_trampoline(void (*target)(void*), void* arg) {
+  (*target)(arg);
+}
+
+// Initialises `ctx` so that switching to it enters
+// marl_fiber_trampoline(target, arg) with SP at the top of `stack`.
+MARL_EXPORT
+void marl_fiber_set_target(struct marl_fiber_context* ctx,
+                           void* stack,
+                           uint32_t stack_size,
+                           void (*target)(void*),
+                           void* arg) {
+  // One-past-the-end address of the stack allocation.
+  const uintptr_t stack_end = (uintptr_t)((uint8_t*)stack + stack_size);
+
+  ctx->r0 = (uintptr_t)target;
+  ctx->r1 = (uintptr_t)arg;
+  ctx->LR = (uintptr_t)&marl_fiber_trampoline;
+  // Round the initial stack pointer down to a 16-byte boundary.
+  ctx->SP = stack_end & ~(uintptr_t)0xF;
+}
+
+#endif  // defined(__arm__)
--- a/3party/marl/src/osfiber_asm.h
+++ b/3party/marl/src/osfiber_asm.h
@ -0,0 +1,154 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Minimal assembly implementations of fiber context switching for Unix-based
+// platforms.
+//
+// Note: Unlike makecontext, swapcontext or the Windows fiber APIs, these
+// assembly implementations *do not* save or restore signal masks,
+// floating-point control or status registers, FS and GS segment registers,
+// thread-local storage state nor any SIMD registers. This should not be a
+// problem as the marl scheduler requires fibers to be executed on the same
+// thread throughout their lifetime.
+
+#if defined(__x86_64__)
+#include "osfiber_asm_x64.h"
+#elif defined(__i386__)
+#include "osfiber_asm_x86.h"
+#elif defined(__aarch64__)
+#include "osfiber_asm_aarch64.h"
+#elif defined(__arm__)
+#include "osfiber_asm_arm.h"
+#elif defined(__powerpc64__)
+#include "osfiber_asm_ppc64.h"
+#elif defined(__mips__) && _MIPS_SIM == _ABI64
+#include "osfiber_asm_mips64.h"
+#elif defined(__riscv) && __riscv_xlen == 64
+#include "osfiber_asm_rv64.h"
+#elif defined(__loongarch__) && _LOONGARCH_SIM == _ABILP64
+#include "osfiber_asm_loongarch64.h"
+#elif defined(__EMSCRIPTEN__)
+#include "osfiber_emscripten.h"
+#else
+#error "Unsupported target"
+#endif
+
+#include "marl/export.h"
+#include "marl/memory.h"
+
+#include <functional>
+#include <memory>
+
+extern "C" {
+
+// marl_main_fiber_init() is called on the context of a fiber created from the
+// current thread. For Emscripten it is only declared here; for every other
+// target it is an empty inline function.
+#if defined(__EMSCRIPTEN__)
+MARL_EXPORT
+void marl_main_fiber_init(marl_fiber_context* ctx);
+#else
+MARL_EXPORT
+inline void marl_main_fiber_init(marl_fiber_context* /*ctx*/) {}
+#endif
+
+// marl_fiber_set_target() prepares ctx to call target(arg) on the given stack
+// of stack_size bytes.
+MARL_EXPORT
+void marl_fiber_set_target(marl_fiber_context* ctx,
+                           void* stack,
+                           uint32_t stack_size,
+                           void (*target)(void*),
+                           void* arg);
+
+// marl_fiber_swap() stores the current execution state in `from` and resumes
+// the state held in `to`.
+MARL_EXPORT
+void marl_fiber_swap(marl_fiber_context* from, const marl_fiber_context* to);
+
+}  // extern "C"
+
+namespace marl {
+
+// OSFiber is the fiber implementation built on the marl_fiber_* context
+// switching functions.
+class OSFiber {
+ public:
+  inline OSFiber(Allocator* allocator);
+  inline ~OSFiber();
+
+  // createFiberFromCurrentThread() wraps the calling thread in a fiber and
+  // returns it. No stack is allocated for such a fiber.
+  MARL_NO_EXPORT static inline Allocator::unique_ptr<OSFiber>
+  createFiberFromCurrentThread(Allocator* allocator);
+
+  // createFiber() returns a new fiber, with its own stack of stackSize bytes,
+  // that calls func when it is first switched to. func() must finish by
+  // switching to another fiber; it must never return.
+  MARL_NO_EXPORT static inline Allocator::unique_ptr<OSFiber> createFiber(
+      Allocator* allocator,
+      size_t stackSize,
+      const std::function<void()>& func);
+
+  // switchTo() immediately transfers execution to the given fiber. It must be
+  // called on the fiber that is currently executing.
+  MARL_NO_EXPORT inline void switchTo(OSFiber* fiber);
+
+ private:
+  // run() is the entry point of fibers made by createFiber(); it invokes
+  // self->target.
+  MARL_NO_EXPORT
+  static inline void run(OSFiber* self);
+
+  Allocator* allocator;          // used to free `stack` on destruction
+  marl_fiber_context context;    // saved register state of this fiber
+  std::function<void()> target;  // function run by the fiber
+  Allocation stack;              // stack memory; ptr is null if none is owned
+};
+
+// Remembers the allocator so the destructor can release the fiber's stack.
+OSFiber::OSFiber(Allocator* alloc) : allocator(alloc) {}
+
+// Returns the fiber's stack to the allocator, if the fiber has one.
+OSFiber::~OSFiber() {
+  if (stack.ptr == nullptr) {
+    return;  // Nothing to release.
+  }
+  allocator->free(stack);
+}
+
+// Builds an OSFiber for the calling thread: the context is zero-initialised
+// and passed to marl_main_fiber_init(); no stack is allocated.
+Allocator::unique_ptr<OSFiber> OSFiber::createFiberFromCurrentThread(
+    Allocator* allocator) {
+  auto fiber = allocator->make_unique<OSFiber>(allocator);
+  fiber->context = marl_fiber_context{};
+  marl_main_fiber_init(&fiber->context);
+  return fiber;
+}
+
+// Creates a fiber with a newly allocated, 16-byte aligned stack of stackSize
+// bytes. The fiber runs a copy of func the first time it is switched to.
+// Stack guards are requested when MARL_USE_FIBER_STACK_GUARDS is non-zero.
+Allocator::unique_ptr<OSFiber> OSFiber::createFiber(
+    Allocator* allocator,
+    size_t stackSize,
+    const std::function<void()>& func) {
+  Allocation::Request request;
+  request.size = stackSize;
+  request.alignment = 16;
+  request.usage = Allocation::Usage::Stack;
+#if MARL_USE_FIBER_STACK_GUARDS
+  request.useGuards = true;
+#endif
+
+  // The fiber trampoline (see e.g. osfiber_aarch64.c) invokes the target
+  // through a `void (*)(void*)` pointer, so give it a function of exactly
+  // that type. Passing &OSFiber::run, which is a `void (*)(OSFiber*)`, via
+  // reinterpret_cast makes that call undefined behaviour and is rejected by
+  // -fsanitize=function and CFI indirect-call checking.
+  void (*const entry)(void*) = [](void* self) {
+    OSFiber::run(static_cast<OSFiber*>(self));
+  };
+
+  auto out = allocator->make_unique<OSFiber>(allocator);
+  out->context = {};
+  out->target = func;
+  out->stack = allocator->allocate(request);
+  // The C interface takes the stack size as uint32_t, hence the narrowing.
+  marl_fiber_set_target(&out->context, out->stack.ptr,
+                        static_cast<uint32_t>(stackSize), entry, out.get());
+  return out;
+}
+
+// Entry point of a fiber made by createFiber(): calls the stored function.
+void OSFiber::run(OSFiber* self) {
+  auto& entry = self->target;
+  entry();
+}
+
+// Saves the running state into this fiber's context and resumes `fiber`.
+void OSFiber::switchTo(OSFiber* fiber) {
+  marl_fiber_context* const from = &context;
+  const marl_fiber_context* const to = &fiber->context;
+  marl_fiber_swap(from, to);
+}
+
+}  // namespace marl
--- a/3party/marl/src/osfiber_asm_aarch64.S
+++ b/3party/marl/src/osfiber_asm_aarch64.S
@ -0,0 +1,166 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if defined(__aarch64__)
+
+#define MARL_BUILD_ASM 1
+#include "osfiber_asm_aarch64.h"
+
+
+// ENABLE_PAUTH / ENABLE_BTI record whether the compiler targets pointer
+// authentication / branch target identification by default.
+#if defined(__ARM_FEATURE_PAC_DEFAULT) && __ARM_FEATURE_PAC_DEFAULT
+// ENABLE_PAUTH must be defined to 1 since this value will be used in
+// bitwise-shift later!
+#define ENABLE_PAUTH 1
+
+// Bits 0 and 1 of __ARM_FEATURE_PAC_DEFAULT are tested here; bit 0 selects
+// the A-key instructions below, otherwise the B-key ones are used.
+#if ((__ARM_FEATURE_PAC_DEFAULT & ((1 << 0) | (1 << 1))) == 0)
+#error Pointer authentication defines no valid key!
+#endif
+#else
+#define ENABLE_PAUTH 0
+#endif
+
+#if defined(__ARM_FEATURE_BTI_DEFAULT) && (__ARM_FEATURE_BTI_DEFAULT == 1)
+// ENABLE_BTI must be defined to 1 since this value will be used in
+// bitwise-shift later!
+#define ENABLE_BTI 1
+#else
+#define ENABLE_BTI 0
+#endif
+
+// Although Pointer Authentication and Branch Target Instructions are
+// technically separate features they work together, i.e. the paciasp and
+// pacibsp instructions serve as BTI landing pads. Therefore PA-instructions are
+// enabled when PA _or_ BTI is enabled!
+#if ENABLE_PAUTH || ENABLE_BTI
+// See section "Pointer Authentication" of
+// https://developer.arm.com/documentation/101028/0012/5--Feature-test-macros
+// for details how to interpret __ARM_FEATURE_PAC_DEFAULT
+// Note: if __ARM_FEATURE_PAC_DEFAULT is undefined (BTI-only build) it
+// evaluates to 0 here, so the B-key variants are selected.
+#if (__ARM_FEATURE_PAC_DEFAULT & (1 << 0))
+#define PAUTH_SIGN_SP paciasp
+#define PAUTH_AUTH_SP autiasp
+#else
+#define PAUTH_SIGN_SP pacibsp
+#define PAUTH_AUTH_SP autibsp
+#endif
+#else
+// Neither feature enabled: both macros expand to nothing.
+#define PAUTH_SIGN_SP
+#define PAUTH_AUTH_SP
+#endif
+
+
+// void marl_fiber_swap(marl_fiber_context* from, const marl_fiber_context* to)
+// x0: from
+// x1: to
+//
+// Stores the current register state in *from, loads the state held in *to
+// and returns to the LR loaded from *to. x0 and x1 are loaded from the r0 and
+// r1 slots of *to on the way out.
+.text
+.global MARL_ASM_SYMBOL(marl_fiber_swap)
+.align 4
+MARL_ASM_SYMBOL(marl_fiber_swap):
+
+    // Save context 'from'
+    // TODO: pairs of str can be combined with stp.
+
+    // Expands to paciasp / pacibsp, or to nothing when neither PAuth nor BTI
+    // is enabled.
+    PAUTH_SIGN_SP
+
+    // Store special purpose registers
+    str x16, [x0, #MARL_REG_r16]
+    str x17, [x0, #MARL_REG_r17]
+    str x18, [x0, #MARL_REG_r18]
+
+    // Store callee-preserved registers
+    str x19, [x0, #MARL_REG_r19]
+    str x20, [x0, #MARL_REG_r20]
+    str x21, [x0, #MARL_REG_r21]
+    str x22, [x0, #MARL_REG_r22]
+    str x23, [x0, #MARL_REG_r23]
+    str x24, [x0, #MARL_REG_r24]
+    str x25, [x0, #MARL_REG_r25]
+    str x26, [x0, #MARL_REG_r26]
+    str x27, [x0, #MARL_REG_r27]
+    str x28, [x0, #MARL_REG_r28]
+    str x29, [x0, #MARL_REG_r29]
+
+    // Only the 64-bit d views of v8-v15 are stored.
+    str d8,  [x0, #MARL_REG_v8]
+    str d9,  [x0, #MARL_REG_v9]
+    str d10, [x0, #MARL_REG_v10]
+    str d11, [x0, #MARL_REG_v11]
+    str d12, [x0, #MARL_REG_v12]
+    str d13, [x0, #MARL_REG_v13]
+    str d14, [x0, #MARL_REG_v14]
+    str d15, [x0, #MARL_REG_v15]
+
+    // Store sp and lr
+    // sp is copied through x2 before being stored.
+    mov x2, sp
+    str x2,  [x0, #MARL_REG_SP]
+    str x30, [x0, #MARL_REG_LR]
+
+    // Load context 'to'
+    // Keep 'to' in x7, because x0 and x1 are overwritten further down.
+    mov x7, x1
+
+    // Load special purpose registers
+    ldr x16, [x7, #MARL_REG_r16]
+    ldr x17, [x7, #MARL_REG_r17]
+    ldr x18, [x7, #MARL_REG_r18]
+
+    // Load callee-preserved registers
+    ldr x19, [x7, #MARL_REG_r19]
+    ldr x20, [x7, #MARL_REG_r20]
+    ldr x21, [x7, #MARL_REG_r21]
+    ldr x22, [x7, #MARL_REG_r22]
+    ldr x23, [x7, #MARL_REG_r23]
+    ldr x24, [x7, #MARL_REG_r24]
+    ldr x25, [x7, #MARL_REG_r25]
+    ldr x26, [x7, #MARL_REG_r26]
+    ldr x27, [x7, #MARL_REG_r27]
+    ldr x28, [x7, #MARL_REG_r28]
+    ldr x29, [x7, #MARL_REG_r29]
+
+    ldr d8,  [x7, #MARL_REG_v8]
+    ldr d9,  [x7, #MARL_REG_v9]
+    ldr d10, [x7, #MARL_REG_v10]
+    ldr d11, [x7, #MARL_REG_v11]
+    ldr d12, [x7, #MARL_REG_v12]
+    ldr d13, [x7, #MARL_REG_v13]
+    ldr d14, [x7, #MARL_REG_v14]
+    ldr d15, [x7, #MARL_REG_v15]
+
+    // Load parameter registers
+    // (for a new fiber these hold the trampoline's target and arg)
+    ldr x0, [x7, #MARL_REG_r0]
+    ldr x1, [x7, #MARL_REG_r1]
+
+    // Load sp and lr
+    ldr x30, [x7, #MARL_REG_LR]
+    ldr x2,  [x7, #MARL_REG_SP]
+    mov sp, x2
+
+    // Expands to autiasp / autibsp, or to nothing (see PAUTH_SIGN_SP above).
+    PAUTH_AUTH_SP
+
+    // Return to the address just loaded into x30.
+    ret
+
+// Emit a .note.gnu.property note recording which of PAuth / BTI this object
+// was built with.
+#if ENABLE_PAUTH || ENABLE_BTI
+// see
+// https://github.com/ARM-software/abi-aa/blob/main/aaelf64/aaelf64.rst#program-property
+.pushsection .note.gnu.property, "a";
+    .balign 8
+    .long 4 /* name size: "GNU" plus terminating NUL */
+    .long 0x10 /* descriptor size: the four .long values after the name */
+    .long 0x5 /* note type (NT_GNU_PROPERTY_TYPE_0) */
+    .asciz "GNU"
+    .long 0xc0000000 /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */
+    .long 4 /* size of the property data that follows */
+    .long ((ENABLE_PAUTH)<<1) | ((ENABLE_BTI)<<0) /* PAuth and BTI */
+    .long 0 /* padding */
+.popsection
+#endif
+
+#endif // defined(__aarch64__)
--- a/3party/marl/src/osfiber_asm_aarch64.h
+++ b/3party/marl/src/osfiber_asm_aarch64.h
@ -0,0 +1,146 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Byte offsets of the register slots inside marl_fiber_context. They are used
+// by the assembly in osfiber_asm_aarch64.S and must match the struct declared
+// below; when this header is compiled as C++ the static_asserts at the end of
+// the file check that they do.
+#define MARL_REG_r0 0x00
+#define MARL_REG_r1 0x08
+#define MARL_REG_r16 0x10
+#define MARL_REG_r17 0x18
+#define MARL_REG_r18 0x20
+#define MARL_REG_r19 0x28
+#define MARL_REG_r20 0x30
+#define MARL_REG_r21 0x38
+#define MARL_REG_r22 0x40
+#define MARL_REG_r23 0x48
+#define MARL_REG_r24 0x50
+#define MARL_REG_r25 0x58
+#define MARL_REG_r26 0x60
+#define MARL_REG_r27 0x68
+#define MARL_REG_r28 0x70
+#define MARL_REG_r29 0x78
+#define MARL_REG_v8 0x80
+#define MARL_REG_v9 0x88
+#define MARL_REG_v10 0x90
+#define MARL_REG_v11 0x98
+#define MARL_REG_v12 0xa0
+#define MARL_REG_v13 0xa8
+#define MARL_REG_v14 0xb0
+#define MARL_REG_v15 0xb8
+#define MARL_REG_SP 0xc0
+#define MARL_REG_LR 0xc8
+
+// MARL_ASM_SYMBOL(x) yields the assembler-level name of symbol x: a leading
+// underscore is prepended on Apple targets, otherwise the name is unchanged.
+#if defined(__APPLE__)
+#define MARL_ASM_SYMBOL(x) _##x
+#else
+#define MARL_ASM_SYMBOL(x) x
+#endif
+
+#ifndef MARL_BUILD_ASM
+
+#include <stdint.h>
+
+// Procedure Call Standard for the ARM 64-bit Architecture
+// http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf
+//
+// marl_fiber_context holds the register state saved and restored by
+// marl_fiber_swap(). The field order defines the MARL_REG_* offsets used by
+// the assembly, so fields must not be reordered or resized.
+struct marl_fiber_context {
+  // parameter registers
+  // (loaded into x0 / x1 when the context is switched to)
+  uintptr_t r0;
+  uintptr_t r1;
+
+  // special purpose registers
+  uintptr_t r16;
+  uintptr_t r17;
+  uintptr_t r18;  // platform specific (maybe inter-procedural state)
+
+  // callee-saved registers
+  uintptr_t r19;
+  uintptr_t r20;
+  uintptr_t r21;
+  uintptr_t r22;
+  uintptr_t r23;
+  uintptr_t r24;
+  uintptr_t r25;
+  uintptr_t r26;
+  uintptr_t r27;
+  uintptr_t r28;
+  uintptr_t r29;
+
+  // marl_fiber_swap() stores d8-d15 in these slots
+  uintptr_t v8;
+  uintptr_t v9;
+  uintptr_t v10;
+  uintptr_t v11;
+  uintptr_t v12;
+  uintptr_t v13;
+  uintptr_t v14;
+  uintptr_t v15;
+
+  uintptr_t SP;  // stack pointer
+  uintptr_t LR;  // link register (R30)
+};
+
+#ifdef __cplusplus
+#include <cstddef>
+static_assert(offsetof(marl_fiber_context, r0) == MARL_REG_r0,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r1) == MARL_REG_r1,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r16) == MARL_REG_r16,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r17) == MARL_REG_r17,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r18) == MARL_REG_r18,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r19) == MARL_REG_r19,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r20) == MARL_REG_r20,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r21) == MARL_REG_r21,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r22) == MARL_REG_r22,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r23) == MARL_REG_r23,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r24) == MARL_REG_r24,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r25) == MARL_REG_r25,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r26) == MARL_REG_r26,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r27) == MARL_REG_r27,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r28) == MARL_REG_r28,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r29) == MARL_REG_r29,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, v8) == MARL_REG_v8,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, v9) == MARL_REG_v9,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, v10) == MARL_REG_v10,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, v11) == MARL_REG_v11,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, v12) == MARL_REG_v12,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, v13) == MARL_REG_v13,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, v14) == MARL_REG_v14,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, v15) == MARL_REG_v15,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, SP) == MARL_REG_SP,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, LR) == MARL_REG_LR,
+              "Bad register offset");
+#endif  // __cplusplus
+
+#endif  // MARL_BUILD_ASM
--- a/3party/marl/src/osfiber_asm_arm.S
+++ b/3party/marl/src/osfiber_asm_arm.S
@ -0,0 +1,75 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if defined(__arm__)
+
+#define MARL_BUILD_ASM 1
+#include "osfiber_asm_arm.h"
+
+// void marl_fiber_swap(marl_fiber_context* from, const marl_fiber_context* to)
+// r0: from
+// r1: to
+//
+// Stores the current register state in *from, loads the state held in *to
+// and returns to the LR loaded from *to. r0 and r1 are loaded from the r0 and
+// r1 slots of *to on the way out.
+// NOTE(review): the v8-v15 slots of marl_fiber_context are neither stored nor
+// loaded here - confirm this is intended for hard-float targets.
+.text
+.global marl_fiber_swap
+.align 4
+.type marl_fiber_swap, %function
+marl_fiber_swap:
+
+    // Save context 'from'
+    // TODO: multiple registers can be stored in a single instruction with: stm rA, {rB-rC}
+
+    // Store special purpose registers
+    str r12, [r0, #MARL_REG_r12]
+
+    // Store callee-preserved registers
+    str r4, [r0, #MARL_REG_r4]
+    str r5, [r0, #MARL_REG_r5]
+    str r6, [r0, #MARL_REG_r6]
+    str r7, [r0, #MARL_REG_r7]
+    str r8, [r0, #MARL_REG_r8]
+    str r9, [r0, #MARL_REG_r9]
+    str r10, [r0, #MARL_REG_r10]
+    str r11, [r0, #MARL_REG_r11]
+
+    // Store sp and lr
+    str sp, [r0, #MARL_REG_SP]
+    str lr, [r0, #MARL_REG_LR]
+
+    // Load context 'to'
+    // TODO: multiple registers can be loaded in a single instruction with: ldm rA, {rB-rC}
+    // Keep 'to' in r3, because r0 and r1 are overwritten further down.
+    mov r3, r1
+
+    // Load special purpose registers
+    ldr r12, [r3, #MARL_REG_r12]
+
+    // Load callee-preserved registers
+    ldr r4, [r3, #MARL_REG_r4]
+    ldr r5, [r3, #MARL_REG_r5]
+    ldr r6, [r3, #MARL_REG_r6]
+    ldr r7, [r3, #MARL_REG_r7]
+    ldr r8, [r3, #MARL_REG_r8]
+    ldr r9, [r3, #MARL_REG_r9]
+    ldr r10, [r3, #MARL_REG_r10]
+    ldr r11, [r3, #MARL_REG_r11]
+
+    // Load parameter registers
+    // (for a new fiber these hold the trampoline's target and arg)
+    ldr r0, [r3, #MARL_REG_r0]
+    ldr r1, [r3, #MARL_REG_r1]
+
+    // Load sp and lr, then return to the address held in lr.
+    ldr sp, [r3, #MARL_REG_SP]
+    ldr lr, [r3, #MARL_REG_LR]
+    // bx honours the ARM/Thumb state bit of the return address; a plain
+    // 'mov pc, lr' is not an interworking branch in Thumb state or on
+    // pre-ARMv7 cores.
+    bx lr
+
+#endif // defined(__arm__)
--- a/3party/marl/src/osfiber_asm_arm.h
+++ b/3party/marl/src/osfiber_asm_arm.h
@ -0,0 +1,119 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Byte offsets of the register slots inside marl_fiber_context. They are used
+// by the assembly in osfiber_asm_arm.S and must match the struct declared
+// below; when this header is compiled as C++ the static_asserts at the end of
+// the file check that they do.
+#define MARL_REG_r0 0x00
+#define MARL_REG_r1 0x04
+#define MARL_REG_r12 0x08
+#define MARL_REG_r4 0x0c
+#define MARL_REG_r5 0x10
+#define MARL_REG_r6 0x14
+#define MARL_REG_r7 0x18
+#define MARL_REG_r8 0x1c
+#define MARL_REG_r9 0x20
+#define MARL_REG_r10 0x24
+#define MARL_REG_r11 0x28
+#define MARL_REG_v8 0x2c
+#define MARL_REG_v9 0x30
+#define MARL_REG_v10 0x34
+#define MARL_REG_v11 0x38
+#define MARL_REG_v12 0x3c
+#define MARL_REG_v13 0x40
+#define MARL_REG_v14 0x44
+#define MARL_REG_v15 0x48
+#define MARL_REG_SP 0x4c
+#define MARL_REG_LR 0x50
+
+#ifndef MARL_BUILD_ASM
+#include <stdint.h>
+
+// Procedure Call Standard for the Arm Architecture (32-bit, AAPCS32)
+// https://github.com/ARM-software/abi-aa/blob/main/aapcs32/aapcs32.rst
+//
+// marl_fiber_context holds the register state saved and restored by
+// marl_fiber_swap(). The field order defines the MARL_REG_* offsets used by
+// the assembly, so fields must not be reordered or resized.
+struct marl_fiber_context {
+  // parameter registers
+  // (loaded into r0 / r1 when the context is switched to)
+  uintptr_t r0;
+  uintptr_t r1;
+
+  // special purpose registers
+  uintptr_t r12;  // Intra-Procedure-call
+
+  // callee-saved registers
+  uintptr_t r4;
+  uintptr_t r5;
+  uintptr_t r6;
+  uintptr_t r7;
+  uintptr_t r8;
+  uintptr_t r9;
+  uintptr_t r10;
+  uintptr_t r11;
+
+  // NOTE(review): marl_fiber_swap() in osfiber_asm_arm.S does not read or
+  // write the v8-v15 slots - confirm whether they are reserved or unused.
+  uintptr_t v8;
+  uintptr_t v9;
+  uintptr_t v10;
+  uintptr_t v11;
+  uintptr_t v12;
+  uintptr_t v13;
+  uintptr_t v14;
+  uintptr_t v15;
+
+  uintptr_t SP;  // stack pointer (r13)
+  uintptr_t LR;  // link register (r14)
+};
+
+#ifdef __cplusplus
+#include <cstddef>
+static_assert(offsetof(marl_fiber_context, r0) == MARL_REG_r0,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r1) == MARL_REG_r1,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r12) == MARL_REG_r12,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r4) == MARL_REG_r4,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r5) == MARL_REG_r5,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r6) == MARL_REG_r6,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r7) == MARL_REG_r7,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r8) == MARL_REG_r8,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r9) == MARL_REG_r9,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r10) == MARL_REG_r10,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r11) == MARL_REG_r11,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, v8) == MARL_REG_v8,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, v9) == MARL_REG_v9,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, v10) == MARL_REG_v10,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, v11) == MARL_REG_v11,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, v12) == MARL_REG_v12,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, v13) == MARL_REG_v13,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, v14) == MARL_REG_v14,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, v15) == MARL_REG_v15,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, SP) == MARL_REG_SP,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, LR) == MARL_REG_LR,
+              "Bad register offset");
+#endif  // __cplusplus
+
+#endif  // MARL_BUILD_ASM
--- a/3party/marl/src/osfiber_asm_loongarch64.S
+++ b/3party/marl/src/osfiber_asm_loongarch64.S
@ -0,0 +1,84 @@
+// Copyright 2022 The Marl Authors.
+//
+// Licensed under the Apache License. Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if defined(__loongarch_lp64)
+
+#define MARL_BUILD_ASM 1
+#include "osfiber_asm_loongarch64.h"
+
+// void marl_fiber_swap(marl_fiber_context* from, const marl_fiber_context* to)
+// a0: from
+// a1: to
+//
+// Stores the current register state in *from, loads the state held in *to
+// and jumps to the $ra loaded from *to. $a0 and $a1 are loaded from the a0
+// and a1 slots of *to on the way out.
+.text
+.global marl_fiber_swap
+.align 4
+marl_fiber_swap:
+
+    // Save context 'from'
+
+    // Store callee-preserved registers
+    st.d $s0, $a0, MARL_REG_s0
+    st.d $s1, $a0, MARL_REG_s1
+    st.d $s2, $a0, MARL_REG_s2
+    st.d $s3, $a0, MARL_REG_s3
+    st.d $s4, $a0, MARL_REG_s4
+    st.d $s5, $a0, MARL_REG_s5
+    st.d $s6, $a0, MARL_REG_s6
+    st.d $s7, $a0, MARL_REG_s7
+    st.d $s8, $a0, MARL_REG_s8
+
+    // Store the floating-point registers fs0-fs7
+    fst.d $fs0, $a0, MARL_REG_fs0
+    fst.d $fs1, $a0, MARL_REG_fs1
+    fst.d $fs2, $a0, MARL_REG_fs2
+    fst.d $fs3, $a0, MARL_REG_fs3
+    fst.d $fs4, $a0, MARL_REG_fs4
+    fst.d $fs5, $a0, MARL_REG_fs5
+    fst.d $fs6, $a0, MARL_REG_fs6
+    fst.d $fs7, $a0, MARL_REG_fs7
+
+    // Store return address, stack pointer and frame pointer
+    st.d $ra, $a0, MARL_REG_ra
+    st.d $sp, $a0, MARL_REG_sp
+    st.d $fp, $a0, MARL_REG_fp
+
+    // Recover callee-preserved registers
+    ld.d $s0, $a1, MARL_REG_s0
+    ld.d $s1, $a1, MARL_REG_s1
+    ld.d $s2, $a1, MARL_REG_s2
+    ld.d $s3, $a1, MARL_REG_s3
+    ld.d $s4, $a1, MARL_REG_s4
+    ld.d $s5, $a1, MARL_REG_s5
+    ld.d $s6, $a1, MARL_REG_s6
+    ld.d $s7, $a1, MARL_REG_s7
+    ld.d $s8, $a1, MARL_REG_s8
+
+    // Recover the floating-point registers fs0-fs7
+    fld.d $fs0, $a1, MARL_REG_fs0
+    fld.d $fs1, $a1, MARL_REG_fs1
+    fld.d $fs2, $a1, MARL_REG_fs2
+    fld.d $fs3, $a1, MARL_REG_fs3
+    fld.d $fs4, $a1, MARL_REG_fs4
+    fld.d $fs5, $a1, MARL_REG_fs5
+    fld.d $fs6, $a1, MARL_REG_fs6
+    fld.d $fs7, $a1, MARL_REG_fs7
+
+    // Recover return address, stack pointer and frame pointer
+    ld.d $ra, $a1, MARL_REG_ra
+    ld.d $sp, $a1, MARL_REG_sp
+    ld.d $fp, $a1, MARL_REG_fp
+
+    // Recover arguments
+    // $a1 is loaded last because it is the base register for every load above.
+    ld.d $a0, $a1, MARL_REG_a0
+    ld.d $a1, $a1, MARL_REG_a1
+
+    jr $ra // Jump to the trampoline
+
+#endif // defined(__loongarch_lp64)
--- a/3party/marl/src/osfiber_asm_loongarch64.h
+++ b/3party/marl/src/osfiber_asm_loongarch64.h
@ -0,0 +1,122 @@
+// Copyright 2022 The Marl Authors.
+//
+// Licensed under the Apache License. Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Byte offsets of the register slots inside marl_fiber_context. They are used
+// by the assembly in osfiber_asm_loongarch64.S and must match the struct
+// declared below; when this header is compiled as C++ the static_asserts at
+// the end of the file check that they do.
+#define MARL_REG_a0 0x00
+#define MARL_REG_a1 0x08
+#define MARL_REG_s0 0x10
+#define MARL_REG_s1 0x18
+#define MARL_REG_s2 0x20
+#define MARL_REG_s3 0x28
+#define MARL_REG_s4 0x30
+#define MARL_REG_s5 0x38
+#define MARL_REG_s6 0x40
+#define MARL_REG_s7 0x48
+#define MARL_REG_s8 0x50
+#define MARL_REG_fs0 0x58
+#define MARL_REG_fs1 0x60
+#define MARL_REG_fs2 0x68
+#define MARL_REG_fs3 0x70
+#define MARL_REG_fs4 0x78
+#define MARL_REG_fs5 0x80
+#define MARL_REG_fs6 0x88
+#define MARL_REG_fs7 0x90
+#define MARL_REG_ra 0x98
+#define MARL_REG_sp 0xa0
+#define MARL_REG_fp 0xa8
+
+#ifndef MARL_BUILD_ASM
+
+#include <stdint.h>
+
+// Procedure Call Standard for the LoongArch 64-bit Architecture
+// https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html
+//
+// marl_fiber_context holds the register state saved and restored by
+// marl_fiber_swap(). The field order defines the MARL_REG_* offsets used by
+// the assembly, so fields must not be reordered or resized.
+struct marl_fiber_context {
+  // parameter registers (First two)
+  // (loaded into $a0 / $a1 when the context is switched to)
+  uintptr_t a0;
+  uintptr_t a1;
+
+  // callee-saved registers
+  uintptr_t s0;
+  uintptr_t s1;
+  uintptr_t s2;
+  uintptr_t s3;
+  uintptr_t s4;
+  uintptr_t s5;
+  uintptr_t s6;
+  uintptr_t s7;
+  uintptr_t s8;
+
+  // floating-point registers fs0-fs7, stored with fst.d by marl_fiber_swap()
+  uintptr_t fs0;
+  uintptr_t fs1;
+  uintptr_t fs2;
+  uintptr_t fs3;
+  uintptr_t fs4;
+  uintptr_t fs5;
+  uintptr_t fs6;
+  uintptr_t fs7;
+
+  uintptr_t ra;  // return address; marl_fiber_swap() jumps to it
+  uintptr_t sp;  // stack pointer
+  uintptr_t fp;  // frame pointer
+};
+
+#ifdef __cplusplus
+#include <cstddef>
+static_assert(offsetof(marl_fiber_context, a0) == MARL_REG_a0,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, a1) == MARL_REG_a1,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s0) == MARL_REG_s0,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s1) == MARL_REG_s1,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s2) == MARL_REG_s2,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s3) == MARL_REG_s3,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s4) == MARL_REG_s4,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s5) == MARL_REG_s5,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s6) == MARL_REG_s6,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s7) == MARL_REG_s7,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s8) == MARL_REG_s8,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fs0) == MARL_REG_fs0,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fs1) == MARL_REG_fs1,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fs2) == MARL_REG_fs2,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fs3) == MARL_REG_fs3,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fs4) == MARL_REG_fs4,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fs5) == MARL_REG_fs5,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fs6) == MARL_REG_fs6,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fs7) == MARL_REG_fs7,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, ra) == MARL_REG_ra,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, sp) == MARL_REG_sp,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fp) == MARL_REG_fp,
+              "Bad register offset");
+#endif // __cplusplus
+
+#endif // MARL_BUILD_ASM
--- a/3party/marl/src/osfiber_asm_mips64.S
+++ b/3party/marl/src/osfiber_asm_mips64.S
@ -0,0 +1,86 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if defined(__mips__) && _MIPS_SIM == _ABI64
+
+#define MARL_BUILD_ASM 1
+#include "osfiber_asm_mips64.h"
+
+// void marl_fiber_swap(marl_fiber_context* from, const marl_fiber_context* to)
+//
+// Saves the current register state into 'from', then loads the register
+// state held in 'to' and jumps to the return address recorded there.
+// a0: from
+// a1: to
+.text
+.global MARL_ASM_SYMBOL(marl_fiber_swap)
+.align 4
+MARL_ASM_SYMBOL(marl_fiber_swap):
+
+    // Save context 'from'
+
+    // Store callee-preserved registers
+    sd  $s0, MARL_REG_s0($a0)
+    sd  $s1, MARL_REG_s1($a0)
+    sd  $s2, MARL_REG_s2($a0)
+    sd  $s3, MARL_REG_s3($a0)
+    sd  $s4, MARL_REG_s4($a0)
+    sd  $s5, MARL_REG_s5($a0)
+    sd  $s6, MARL_REG_s6($a0)
+    sd  $s7, MARL_REG_s7($a0)
+
+    // Store floating-point registers f24-f31, each into its own slot
+    s.d  $f24, MARL_REG_f24($a0)
+    s.d  $f25, MARL_REG_f25($a0)
+    s.d  $f26, MARL_REG_f26($a0)
+    s.d  $f27, MARL_REG_f27($a0)
+    s.d  $f28, MARL_REG_f28($a0)
+    s.d  $f29, MARL_REG_f29($a0)
+    s.d  $f30, MARL_REG_f30($a0)
+    s.d  $f31, MARL_REG_f31($a0)
+
+    sd  $gp, MARL_REG_gp($a0)
+    sd  $sp, MARL_REG_sp($a0)
+    sd  $fp, MARL_REG_fp($a0)
+    sd  $ra, MARL_REG_ra($a0)
+
+    // Load context 'to'
+
+    // a1 is overwritten when the arguments are recovered below, so keep the
+    // 'to' pointer in v0 (the function returns void, so v0 is free to use).
+    move  $v0, $a1
+
+    // Recover callee-preserved registers
+    ld  $s0, MARL_REG_s0($v0)
+    ld  $s1, MARL_REG_s1($v0)
+    ld  $s2, MARL_REG_s2($v0)
+    ld  $s3, MARL_REG_s3($v0)
+    ld  $s4, MARL_REG_s4($v0)
+    ld  $s5, MARL_REG_s5($v0)
+    ld  $s6, MARL_REG_s6($v0)
+    ld  $s7, MARL_REG_s7($v0)
+
+    // Recover floating-point registers f24-f31 from their matching slots
+    l.d  $f24, MARL_REG_f24($v0)
+    l.d  $f25, MARL_REG_f25($v0)
+    l.d  $f26, MARL_REG_f26($v0)
+    l.d  $f27, MARL_REG_f27($v0)
+    l.d  $f28, MARL_REG_f28($v0)
+    l.d  $f29, MARL_REG_f29($v0)
+    l.d  $f30, MARL_REG_f30($v0)
+    l.d  $f31, MARL_REG_f31($v0)
+
+    ld  $gp, MARL_REG_gp($v0)
+    ld  $sp, MARL_REG_sp($v0)
+    ld  $fp, MARL_REG_fp($v0)
+    ld  $ra, MARL_REG_ra($v0)
+
+    // Recover arguments
+    ld  $a0, MARL_REG_a0($v0)
+    ld  $a1, MARL_REG_a1($v0)
+
+    // Continue at the return address loaded from 'to'
+    jr	$ra
+
+#endif // defined(__mips__) && _MIPS_SIM == _ABI64
--- a/3party/marl/src/osfiber_asm_mips64.h
+++ b/3party/marl/src/osfiber_asm_mips64.h
@ -0,0 +1,126 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Byte offsets of each saved register slot inside marl_fiber_context.
+// osfiber_asm_mips64.S uses them as load/store displacements, and the
+// static_asserts further down this header check them against the struct.
+// Slots are 8 bytes apart.
+#define MARL_REG_a0 0x00
+#define MARL_REG_a1 0x08
+#define MARL_REG_s0 0x10
+#define MARL_REG_s1 0x18
+#define MARL_REG_s2 0x20
+#define MARL_REG_s3 0x28
+#define MARL_REG_s4 0x30
+#define MARL_REG_s5 0x38
+#define MARL_REG_s6 0x40
+#define MARL_REG_s7 0x48
+#define MARL_REG_f24 0x50
+#define MARL_REG_f25 0x58
+#define MARL_REG_f26 0x60
+#define MARL_REG_f27 0x68
+#define MARL_REG_f28 0x70
+#define MARL_REG_f29 0x78
+#define MARL_REG_f30 0x80
+#define MARL_REG_f31 0x88
+#define MARL_REG_gp 0x90
+#define MARL_REG_sp 0x98
+#define MARL_REG_fp 0xa0
+#define MARL_REG_ra 0xa8
+
+// MARL_ASM_SYMBOL(x) yields the name used for symbol x in the assembly
+// source: x with a leading underscore when __APPLE__ is defined, and x
+// unchanged otherwise.
+#if defined(__APPLE__)
+#define MARL_ASM_SYMBOL(x) _##x
+#else
+#define MARL_ASM_SYMBOL(x) x
+#endif
+
+#ifndef MARL_BUILD_ASM
+
+#include <stdint.h>
+
+// Register state saved and restored by marl_fiber_swap on mips64.
+// The member order defines the byte offsets named by the MARL_REG_* macros,
+// so members must not be reordered, resized or removed without updating
+// those macros.
+struct marl_fiber_context {
+  // parameter registers (First two)
+  uintptr_t a0;
+  uintptr_t a1;
+
+  // callee-saved registers
+  uintptr_t s0;
+  uintptr_t s1;
+  uintptr_t s2;
+  uintptr_t s3;
+  uintptr_t s4;
+  uintptr_t s5;
+  uintptr_t s6;
+  uintptr_t s7;
+
+  // floating-point registers f24-f31; the swap routine accesses these slots
+  // with s.d / l.d
+  uintptr_t f24;
+  uintptr_t f25;
+  uintptr_t f26;
+  uintptr_t f27;
+  uintptr_t f28;
+  uintptr_t f29;
+  uintptr_t f30;
+  uintptr_t f31;
+
+  // global pointer, stack pointer, frame pointer and return address
+  uintptr_t gp;
+  uintptr_t sp;
+  uintptr_t fp;
+  uintptr_t ra;
+};
+
+#ifdef __cplusplus
+#include <cstddef>
+static_assert(offsetof(marl_fiber_context, a0) == MARL_REG_a0,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, a1) == MARL_REG_a1,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s0) == MARL_REG_s0,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s1) == MARL_REG_s1,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s2) == MARL_REG_s2,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s3) == MARL_REG_s3,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s4) == MARL_REG_s4,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s5) == MARL_REG_s5,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s6) == MARL_REG_s6,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s7) == MARL_REG_s7,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, f24) == MARL_REG_f24,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, f25) == MARL_REG_f25,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, f26) == MARL_REG_f26,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, f27) == MARL_REG_f27,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, f28) == MARL_REG_f28,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, f29) == MARL_REG_f29,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, f30) == MARL_REG_f30,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, f31) == MARL_REG_f31,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, gp) == MARL_REG_gp,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, sp) == MARL_REG_sp,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fp) == MARL_REG_fp,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, ra) == MARL_REG_ra,
+              "Bad register offset");
+#endif  // __cplusplus
+
+#endif  // MARL_BUILD_ASM
--- a/3party/marl/src/osfiber_asm_ppc64.S
+++ b/3party/marl/src/osfiber_asm_ppc64.S
@ -0,0 +1,204 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if defined(__powerpc64__)
+
+#define MARL_BUILD_ASM 1
+#include "osfiber_asm_ppc64.h"
+
+// void marl_fiber_swap(marl_fiber_context* from, const marl_fiber_context* to)
+//
+// Saves the current register state into 'from', then loads the register
+// state held in 'to' and branches to the link register value recorded there.
+// r3: from
+// r4: to
+.text
+.global marl_fiber_swap
+.align 4
+#if !defined(_CALL_ELF) || (_CALL_ELF != 2)
+// When _CALL_ELF is not 2, marl_fiber_swap names a descriptor placed in .opd
+// (entry address, TOC base, zero) and the code itself lives under the
+// dot-prefixed symbol.
+.global .marl_fiber_swap
+.pushsection ".opd","aw"
+marl_fiber_swap:
+.quad .marl_fiber_swap
+.quad .TOC.@tocbase
+.quad 0
+.popsection
+.type .marl_fiber_swap,@function
+.marl_fiber_swap:
+#else
+.type marl_fiber_swap @function
+marl_fiber_swap:
+#endif
+
+    // Store non-volatile registers
+    std 1, MARL_REG_R1(3)
+    std 2, MARL_REG_R2(3)
+    std 13, MARL_REG_R13(3)
+    std 14, MARL_REG_R14(3)
+    std 15, MARL_REG_R15(3)
+    std 16, MARL_REG_R16(3)
+    std 17, MARL_REG_R17(3)
+    std 18, MARL_REG_R18(3)
+    std 19, MARL_REG_R19(3)
+    std 20, MARL_REG_R20(3)
+    std 21, MARL_REG_R21(3)
+    std 22, MARL_REG_R22(3)
+    std 23, MARL_REG_R23(3)
+    std 24, MARL_REG_R24(3)
+    std 25, MARL_REG_R25(3)
+    std 26, MARL_REG_R26(3)
+    std 27, MARL_REG_R27(3)
+    std 28, MARL_REG_R28(3)
+    std 29, MARL_REG_R29(3)
+    std 30, MARL_REG_R30(3)
+    std 31, MARL_REG_R31(3)
+
+    // Store special registers (r5 is used as scratch)
+    mflr 5
+    std 5, MARL_REG_LR(3)
+    mfcr 5
+    std 5, MARL_REG_CCR(3)
+
+    // Store non-volatile floating point registers
+    stfd 14, MARL_REG_FPR14(3)
+    stfd 15, MARL_REG_FPR15(3)
+    stfd 16, MARL_REG_FPR16(3)
+    stfd 17, MARL_REG_FPR17(3)
+    stfd 18, MARL_REG_FPR18(3)
+    stfd 19, MARL_REG_FPR19(3)
+    stfd 20, MARL_REG_FPR20(3)
+    stfd 21, MARL_REG_FPR21(3)
+    stfd 22, MARL_REG_FPR22(3)
+    stfd 23, MARL_REG_FPR23(3)
+    stfd 24, MARL_REG_FPR24(3)
+    stfd 25, MARL_REG_FPR25(3)
+    stfd 26, MARL_REG_FPR26(3)
+    stfd 27, MARL_REG_FPR27(3)
+    stfd 28, MARL_REG_FPR28(3)
+    stfd 29, MARL_REG_FPR29(3)
+    stfd 30, MARL_REG_FPR30(3)
+    stfd 31, MARL_REG_FPR31(3)
+
+    // Store non-volatile altivec registers
+    // r5 walks the vmx area in 16-byte steps, starting at MARL_REG_VMX
+#ifdef __ALTIVEC__
+    li 5, MARL_REG_VMX
+    stvxl 20, 3, 5
+    addi 5, 5, 16
+    stvxl 21, 3, 5
+    addi 5, 5, 16
+    stvxl 22, 3, 5
+    addi 5, 5, 16
+    stvxl 23, 3, 5
+    addi 5, 5, 16
+    stvxl 24, 3, 5
+    addi 5, 5, 16
+    stvxl 25, 3, 5
+    addi 5, 5, 16
+    stvxl 26, 3, 5
+    addi 5, 5, 16
+    stvxl 27, 3, 5
+    addi 5, 5, 16
+    stvxl 28, 3, 5
+    addi 5, 5, 16
+    stvxl 29, 3, 5
+    addi 5, 5, 16
+    stvxl 30, 3, 5
+    addi 5, 5, 16
+    stvxl 31, 3, 5
+
+    mfvrsave 5
+    stw 5, MARL_REG_VRSAVE(3)
+#endif // __ALTIVEC__
+
+    // Load non-volatile registers
+    ld 1, MARL_REG_R1(4)
+    ld 2, MARL_REG_R2(4)
+    ld 13, MARL_REG_R13(4)
+    ld 14, MARL_REG_R14(4)
+    ld 15, MARL_REG_R15(4)
+    ld 16, MARL_REG_R16(4)
+    ld 17, MARL_REG_R17(4)
+    ld 18, MARL_REG_R18(4)
+    ld 19, MARL_REG_R19(4)
+    ld 20, MARL_REG_R20(4)
+    ld 21, MARL_REG_R21(4)
+    ld 22, MARL_REG_R22(4)
+    ld 23, MARL_REG_R23(4)
+    ld 24, MARL_REG_R24(4)
+    ld 25, MARL_REG_R25(4)
+    ld 26, MARL_REG_R26(4)
+    ld 27, MARL_REG_R27(4)
+    ld 28, MARL_REG_R28(4)
+    ld 29, MARL_REG_R29(4)
+    ld 30, MARL_REG_R30(4)
+    ld 31, MARL_REG_R31(4)
+
+    // Load non-volatile floating point registers
+    lfd 14, MARL_REG_FPR14(4)
+    lfd 15, MARL_REG_FPR15(4)
+    lfd 16, MARL_REG_FPR16(4)
+    lfd 17, MARL_REG_FPR17(4)
+    lfd 18, MARL_REG_FPR18(4)
+    lfd 19, MARL_REG_FPR19(4)
+    lfd 20, MARL_REG_FPR20(4)
+    lfd 21, MARL_REG_FPR21(4)
+    lfd 22, MARL_REG_FPR22(4)
+    lfd 23, MARL_REG_FPR23(4)
+    lfd 24, MARL_REG_FPR24(4)
+    lfd 25, MARL_REG_FPR25(4)
+    lfd 26, MARL_REG_FPR26(4)
+    lfd 27, MARL_REG_FPR27(4)
+    lfd 28, MARL_REG_FPR28(4)
+    lfd 29, MARL_REG_FPR29(4)
+    lfd 30, MARL_REG_FPR30(4)
+    lfd 31, MARL_REG_FPR31(4)
+
+    // Load non-volatile altivec registers
+#ifdef __ALTIVEC__
+    li 5, MARL_REG_VMX
+    lvxl 20, 4, 5
+    addi 5, 5, 16
+    lvxl 21, 4, 5
+    addi 5, 5, 16
+    lvxl 22, 4, 5
+    addi 5, 5, 16
+    lvxl 23, 4, 5
+    addi 5, 5, 16
+    lvxl 24, 4, 5
+    addi 5, 5, 16
+    lvxl 25, 4, 5
+    addi 5, 5, 16
+    lvxl 26, 4, 5
+    addi 5, 5, 16
+    lvxl 27, 4, 5
+    addi 5, 5, 16
+    lvxl 28, 4, 5
+    addi 5, 5, 16
+    lvxl 29, 4, 5
+    addi 5, 5, 16
+    lvxl 30, 4, 5
+    addi 5, 5, 16
+    lvxl 31, 4, 5
+
+    lwz 5, MARL_REG_VRSAVE(4)
+    mtvrsave 5
+#endif // __ALTIVEC__
+
+    // Load condition register
+    // The CR image is stored on the save path; reload it here so that the
+    // CR fields the ABI treats as non-volatile are preserved for the resumed
+    // fiber. This must happen before r4 is replaced below.
+    ld 5, MARL_REG_CCR(4)
+    mtcr 5
+
+    // Load parameters and entrypoint
+    // r4 still holds 'to' until it is overwritten by its own saved value
+    ld 12, MARL_REG_LR(4)
+    ld 3, MARL_REG_R3(4)
+    ld 4, MARL_REG_R4(4)
+    mtlr 12
+
+    // Branch to entrypoint
+    blr
+
+#endif // defined(__powerpc64__)
--- a/3party/marl/src/osfiber_asm_ppc64.h
+++ b/3party/marl/src/osfiber_asm_ppc64.h
@ -0,0 +1,218 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Byte offsets of each saved register slot inside marl_fiber_context.
+// osfiber_asm_ppc64.S uses them as load/store displacements, and the
+// static_asserts further down this header check them against the struct.
+
+// general purpose registers r1, r2 and r13-r31
+#define MARL_REG_R1 0x00
+#define MARL_REG_R2 0x08
+#define MARL_REG_R13 0x10
+#define MARL_REG_R14 0x18
+#define MARL_REG_R15 0x20
+#define MARL_REG_R16 0x28
+#define MARL_REG_R17 0x30
+#define MARL_REG_R18 0x38
+#define MARL_REG_R19 0x40
+#define MARL_REG_R20 0x48
+#define MARL_REG_R21 0x50
+#define MARL_REG_R22 0x58
+#define MARL_REG_R23 0x60
+#define MARL_REG_R24 0x68
+#define MARL_REG_R25 0x70
+#define MARL_REG_R26 0x78
+#define MARL_REG_R27 0x80
+#define MARL_REG_R28 0x88
+#define MARL_REG_R29 0x90
+#define MARL_REG_R30 0x98
+#define MARL_REG_R31 0xa0
+
+// first two parameter registers
+#define MARL_REG_R3 0xa8
+#define MARL_REG_R4 0xb0
+
+// link register and condition register
+#define MARL_REG_LR 0xb8
+#define MARL_REG_CCR 0xc0
+
+// floating-point registers f14-f31
+#define MARL_REG_FPR14 0xc8
+#define MARL_REG_FPR15 0xd0
+#define MARL_REG_FPR16 0xd8
+#define MARL_REG_FPR17 0xe0
+#define MARL_REG_FPR18 0xe8
+#define MARL_REG_FPR19 0xf0
+#define MARL_REG_FPR20 0xf8
+#define MARL_REG_FPR21 0x100
+#define MARL_REG_FPR22 0x108
+#define MARL_REG_FPR23 0x110
+#define MARL_REG_FPR24 0x118
+#define MARL_REG_FPR25 0x120
+#define MARL_REG_FPR26 0x128
+#define MARL_REG_FPR27 0x130
+#define MARL_REG_FPR28 0x138
+#define MARL_REG_FPR29 0x140
+#define MARL_REG_FPR30 0x148
+#define MARL_REG_FPR31 0x150
+
+// VRSAVE word, then the start of the vector register area; the swap routine
+// steps through that area in 16-byte increments from MARL_REG_VMX
+#define MARL_REG_VRSAVE 0x158
+#define MARL_REG_VMX 0x160
+
+#ifndef MARL_BUILD_ASM
+
+#include <stdint.h>
+
+// Register state saved and restored by marl_fiber_swap on ppc64.
+// The member order defines the byte offsets named by the MARL_REG_* macros,
+// so members must not be reordered, resized or removed without updating
+// those macros.
+struct marl_fiber_context {
+  // non-volatile registers
+  uintptr_t r1;
+  uintptr_t r2;
+  uintptr_t r13;
+  uintptr_t r14;
+  uintptr_t r15;
+  uintptr_t r16;
+  uintptr_t r17;
+  uintptr_t r18;
+  uintptr_t r19;
+  uintptr_t r20;
+  uintptr_t r21;
+  uintptr_t r22;
+  uintptr_t r23;
+  uintptr_t r24;
+  uintptr_t r25;
+  uintptr_t r26;
+  uintptr_t r27;
+  uintptr_t r28;
+  uintptr_t r29;
+  uintptr_t r30;
+  uintptr_t r31;
+
+  // first two parameter registers (r3, r4)
+  uintptr_t r3;
+  uintptr_t r4;
+
+  // special registers
+  uintptr_t lr;
+  uintptr_t ccr;
+
+  // non-volatile floating-point registers (f14-f31)
+  uintptr_t fpr14;
+  uintptr_t fpr15;
+  uintptr_t fpr16;
+  uintptr_t fpr17;
+  uintptr_t fpr18;
+  uintptr_t fpr19;
+  uintptr_t fpr20;
+  uintptr_t fpr21;
+  uintptr_t fpr22;
+  uintptr_t fpr23;
+  uintptr_t fpr24;
+  uintptr_t fpr25;
+  uintptr_t fpr26;
+  uintptr_t fpr27;
+  uintptr_t fpr28;
+  uintptr_t fpr29;
+  uintptr_t fpr30;
+  uintptr_t fpr31;
+
+  // non-volatile altivec registers
+  // vrsave is accessed as a 32-bit word (stw / lwz) by the swap routine.
+  uint32_t vrsave;
+  // Storage for the twelve vector registers v20-v31: the swap routine uses
+  // 16 bytes per register, i.e. two uintptr_t elements each.
+  uintptr_t vmx[12 * 2];
+};
+
+#ifdef __cplusplus
+#include <cstddef>
+// Compile-time checks that every marl_fiber_context member used by the
+// assembly lies at the byte offset named by its MARL_REG_* macro.
+static_assert(offsetof(marl_fiber_context, r1) == MARL_REG_R1,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r2) == MARL_REG_R2,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r13) == MARL_REG_R13,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r14) == MARL_REG_R14,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r15) == MARL_REG_R15,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r16) == MARL_REG_R16,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r17) == MARL_REG_R17,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r18) == MARL_REG_R18,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r19) == MARL_REG_R19,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r20) == MARL_REG_R20,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r21) == MARL_REG_R21,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r22) == MARL_REG_R22,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r23) == MARL_REG_R23,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r24) == MARL_REG_R24,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r25) == MARL_REG_R25,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r26) == MARL_REG_R26,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r27) == MARL_REG_R27,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r28) == MARL_REG_R28,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r29) == MARL_REG_R29,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r30) == MARL_REG_R30,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r31) == MARL_REG_R31,
+              "Bad register offset");
+// r3 and r4 are loaded through MARL_REG_R3 / MARL_REG_R4 by the swap
+// routine, so their slots are checked as well.
+static_assert(offsetof(marl_fiber_context, r3) == MARL_REG_R3,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, r4) == MARL_REG_R4,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, lr) == MARL_REG_LR,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, ccr) == MARL_REG_CCR,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fpr14) == MARL_REG_FPR14,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fpr15) == MARL_REG_FPR15,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fpr16) == MARL_REG_FPR16,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fpr17) == MARL_REG_FPR17,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fpr18) == MARL_REG_FPR18,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fpr19) == MARL_REG_FPR19,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fpr20) == MARL_REG_FPR20,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fpr21) == MARL_REG_FPR21,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fpr22) == MARL_REG_FPR22,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fpr23) == MARL_REG_FPR23,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fpr24) == MARL_REG_FPR24,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fpr25) == MARL_REG_FPR25,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fpr26) == MARL_REG_FPR26,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fpr27) == MARL_REG_FPR27,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fpr28) == MARL_REG_FPR28,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fpr29) == MARL_REG_FPR29,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fpr30) == MARL_REG_FPR30,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fpr31) == MARL_REG_FPR31,
+              "Bad register offset");
+static_assert((offsetof(marl_fiber_context, vmx) % 16) == 0,
+              "VMX must be quadword aligned");
+static_assert(offsetof(marl_fiber_context, vmx) == MARL_REG_VMX,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, vrsave) == MARL_REG_VRSAVE,
+              "Bad register offset");
+#endif  // __cplusplus
+
+#endif  // MARL_BUILD_ASM
--- a/3party/marl/src/osfiber_asm_rv64.S
+++ b/3party/marl/src/osfiber_asm_rv64.S
@ -0,0 +1,100 @@
+// Copyright 2021 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if defined(__riscv) && __riscv_xlen == 64
+
+#define MARL_BUILD_ASM 1
+#include "osfiber_asm_rv64.h"
+
+// void marl_fiber_swap(marl_fiber_context* from, const marl_fiber_context* to)
+//
+// Saves the current register state into 'from', then loads the register
+// state held in 'to' and jumps to the return address recorded there.
+// a0: from
+// a1: to
+.text
+.global marl_fiber_swap
+.align 4
+marl_fiber_swap:
+
+    // Save context 'from'
+
+    // Store callee-preserved registers
+    sd  s0, MARL_REG_s0(a0)
+    sd  s1, MARL_REG_s1(a0)
+    sd  s2, MARL_REG_s2(a0)
+    sd  s3, MARL_REG_s3(a0)
+    sd  s4, MARL_REG_s4(a0)
+    sd  s5, MARL_REG_s5(a0)
+    sd  s6, MARL_REG_s6(a0)
+    sd  s7, MARL_REG_s7(a0)
+    sd  s8, MARL_REG_s8(a0)
+    sd  s9, MARL_REG_s9(a0)
+    sd  s10, MARL_REG_s10(a0)
+    sd  s11, MARL_REG_s11(a0)
+
+    // Store floating-point registers fs0-fs11
+    fsd  fs0, MARL_REG_fs0(a0)
+    fsd  fs1, MARL_REG_fs1(a0)
+    fsd  fs2, MARL_REG_fs2(a0)
+    fsd  fs3, MARL_REG_fs3(a0)
+    fsd  fs4, MARL_REG_fs4(a0)
+    fsd  fs5, MARL_REG_fs5(a0)
+    fsd  fs6, MARL_REG_fs6(a0)
+    fsd  fs7, MARL_REG_fs7(a0)
+    fsd  fs8, MARL_REG_fs8(a0)
+    fsd  fs9, MARL_REG_fs9(a0)
+    fsd  fs10, MARL_REG_fs10(a0)
+    fsd  fs11, MARL_REG_fs11(a0)
+
+    sd  sp, MARL_REG_sp(a0)
+    // On RISC-V ra is caller-saved
+    // but we need ra to jump to the trampoline
+    sd  ra, MARL_REG_ra(a0)
+
+    // Load context 'to'
+
+    // a1 is overwritten when the arguments are recovered below, so the 'to'
+    // pointer is kept in t0 for all following loads.
+    move  t0, a1 // Store a1 in temporary register
+
+    // Recover callee-preserved registers
+    ld  s0, MARL_REG_s0(t0)
+    ld  s1, MARL_REG_s1(t0)
+    ld  s2, MARL_REG_s2(t0)
+    ld  s3, MARL_REG_s3(t0)
+    ld  s4, MARL_REG_s4(t0)
+    ld  s5, MARL_REG_s5(t0)
+    ld  s6, MARL_REG_s6(t0)
+    ld  s7, MARL_REG_s7(t0)
+    ld  s8, MARL_REG_s8(t0)
+    ld  s9, MARL_REG_s9(t0)
+    ld  s10, MARL_REG_s10(t0)
+    ld  s11, MARL_REG_s11(t0)
+
+    // Recover floating-point registers fs0-fs11
+    fld  fs0, MARL_REG_fs0(t0)
+    fld  fs1, MARL_REG_fs1(t0)
+    fld  fs2, MARL_REG_fs2(t0)
+    fld  fs3, MARL_REG_fs3(t0)
+    fld  fs4, MARL_REG_fs4(t0)
+    fld  fs5, MARL_REG_fs5(t0)
+    fld  fs6, MARL_REG_fs6(t0)
+    fld  fs7, MARL_REG_fs7(t0)
+    fld  fs8, MARL_REG_fs8(t0)
+    fld  fs9, MARL_REG_fs9(t0)
+    fld  fs10, MARL_REG_fs10(t0)
+    fld  fs11, MARL_REG_fs11(t0)
+
+    ld  sp, MARL_REG_sp(t0)
+    ld  ra, MARL_REG_ra(t0)
+
+    // Recover arguments
+    ld  a0, MARL_REG_a0(t0)
+    ld  a1, MARL_REG_a1(t0)
+
+    jr	ra // Jump to the trampoline
+
+#endif // defined(__riscv) && __riscv_xlen == 64
--- a/3party/marl/src/osfiber_asm_rv64.h
+++ b/3party/marl/src/osfiber_asm_rv64.h
@ -0,0 +1,145 @@
+// Copyright 2021 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+// Byte offsets of each saved register slot inside marl_fiber_context.
+// osfiber_asm_rv64.S uses them as load/store displacements, and the
+// static_asserts further down this header check them against the struct.
+// Slots are 8 bytes apart.
+#define MARL_REG_a0 0x00
+#define MARL_REG_a1 0x08
+#define MARL_REG_s0 0x10
+#define MARL_REG_s1 0x18
+#define MARL_REG_s2 0x20
+#define MARL_REG_s3 0x28
+#define MARL_REG_s4 0x30
+#define MARL_REG_s5 0x38
+#define MARL_REG_s6 0x40
+#define MARL_REG_s7 0x48
+#define MARL_REG_s8 0x50
+#define MARL_REG_s9 0x58
+#define MARL_REG_s10 0x60
+#define MARL_REG_s11 0x68
+#define MARL_REG_fs0 0x70
+#define MARL_REG_fs1 0x78
+#define MARL_REG_fs2 0x80
+#define MARL_REG_fs3 0x88
+#define MARL_REG_fs4 0x90
+#define MARL_REG_fs5 0x98
+#define MARL_REG_fs6 0xa0
+#define MARL_REG_fs7 0xa8
+#define MARL_REG_fs8 0xb0
+#define MARL_REG_fs9 0xb8
+#define MARL_REG_fs10 0xc0
+#define MARL_REG_fs11 0xc8
+#define MARL_REG_sp 0xd0
+#define MARL_REG_ra 0xd8
+
+#ifndef MARL_BUILD_ASM
+
+#include <stdint.h>
+
+// Register state saved and restored by marl_fiber_swap on rv64.
+// The member order defines the byte offsets named by the MARL_REG_* macros,
+// so members must not be reordered, resized or removed without updating
+// those macros.
+struct marl_fiber_context {
+  // parameter registers (First two)
+  uintptr_t a0;
+  uintptr_t a1;
+
+  // callee-saved registers
+  uintptr_t s0;
+  uintptr_t s1;
+  uintptr_t s2;
+  uintptr_t s3;
+  uintptr_t s4;
+  uintptr_t s5;
+  uintptr_t s6;
+  uintptr_t s7;
+  uintptr_t s8;
+  uintptr_t s9;
+  uintptr_t s10;
+  uintptr_t s11;
+
+  // floating-point registers fs0-fs11; the swap routine accesses these
+  // slots with fsd / fld
+  uintptr_t fs0;
+  uintptr_t fs1;
+  uintptr_t fs2;
+  uintptr_t fs3;
+  uintptr_t fs4;
+  uintptr_t fs5;
+  uintptr_t fs6;
+  uintptr_t fs7;
+  uintptr_t fs8;
+  uintptr_t fs9;
+  uintptr_t fs10;
+  uintptr_t fs11;
+
+  // stack pointer and return address
+  uintptr_t sp;
+  uintptr_t ra;
+};
+
+#ifdef __cplusplus
+#include <cstddef>
+static_assert(offsetof(marl_fiber_context, a0) == MARL_REG_a0,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, a1) == MARL_REG_a1,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s0) == MARL_REG_s0,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s1) == MARL_REG_s1,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s2) == MARL_REG_s2,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s3) == MARL_REG_s3,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s4) == MARL_REG_s4,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s5) == MARL_REG_s5,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s6) == MARL_REG_s6,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s7) == MARL_REG_s7,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s8) == MARL_REG_s8,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s9) == MARL_REG_s9,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s10) == MARL_REG_s10,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, s11) == MARL_REG_s11,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fs0) == MARL_REG_fs0,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fs1) == MARL_REG_fs1,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fs2) == MARL_REG_fs2,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fs3) == MARL_REG_fs3,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fs4) == MARL_REG_fs4,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fs5) == MARL_REG_fs5,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fs6) == MARL_REG_fs6,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fs7) == MARL_REG_fs7,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fs8) == MARL_REG_fs8,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fs9) == MARL_REG_fs9,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fs10) == MARL_REG_fs10,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, fs11) == MARL_REG_fs11,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, sp) == MARL_REG_sp,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, ra) == MARL_REG_ra,
+              "Bad register offset");
+#endif  // __cplusplus
+
+#endif  // MARL_BUILD_ASM
--- a/3party/marl/src/osfiber_asm_x64.S
+++ b/3party/marl/src/osfiber_asm_x64.S
@ -0,0 +1,65 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if defined(__x86_64__)
+
+#define MARL_BUILD_ASM 1
+#include "osfiber_asm_x64.h"
+
+// void marl_fiber_swap(marl_fiber_context* from, const marl_fiber_context* to)
+//
+// Saves the current register state into 'from', then loads the register
+// state held in 'to' and jumps to the instruction pointer recorded there.
+// rdi: from
+// rsi: to
+.text
+.global MARL_ASM_SYMBOL(marl_fiber_swap)
+.align 4
+MARL_ASM_SYMBOL(marl_fiber_swap):
+
+    // Save context 'from'
+
+    // Store callee-preserved registers
+    movq        %rbx, MARL_REG_RBX(%rdi)
+    movq        %rbp, MARL_REG_RBP(%rdi)
+    movq        %r12, MARL_REG_R12(%rdi)
+    movq        %r13, MARL_REG_R13(%rdi)
+    movq        %r14, MARL_REG_R14(%rdi)
+    movq        %r15, MARL_REG_R15(%rdi)
+
+    // Record where to resume: the return address becomes the saved RIP and
+    // the stack pointer is saved as it will be after that return (rcx is
+    // used as scratch).
+    movq        (%rsp), %rcx             /* call stores the return address on the stack before jumping */
+    movq        %rcx, MARL_REG_RIP(%rdi)
+    leaq        8(%rsp), %rcx            /* skip the pushed return address */
+    movq        %rcx, MARL_REG_RSP(%rdi)
+
+    // Load context 'to'
+    // rsi is overwritten when the call parameters are loaded below, so the
+    // 'to' pointer is kept in r8.
+    movq        %rsi, %r8
+
+    // Load callee-preserved registers
+    movq        MARL_REG_RBX(%r8), %rbx
+    movq        MARL_REG_RBP(%r8), %rbp
+    movq        MARL_REG_R12(%r8), %r12
+    movq        MARL_REG_R13(%r8), %r13
+    movq        MARL_REG_R14(%r8), %r14
+    movq        MARL_REG_R15(%r8), %r15
+
+    // Load first two call parameters
+    movq        MARL_REG_RDI(%r8), %rdi
+    movq        MARL_REG_RSI(%r8), %rsi
+
+    // Load stack pointer
+    movq        MARL_REG_RSP(%r8), %rsp
+
+    // Load instruction pointer, and jump
+    movq        MARL_REG_RIP(%r8), %rcx
+    jmp         *%rcx
+
+#endif // defined(__x86_64__)
--- a/3party/marl/src/osfiber_asm_x64.h
+++ b/3party/marl/src/osfiber_asm_x64.h
@ -0,0 +1,78 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Byte offsets of each saved register slot inside marl_fiber_context.
+// osfiber_asm_x64.S uses them as load/store displacements, and the
+// static_asserts further down this header check them against the struct.
+// Slots are 8 bytes apart.
+#define MARL_REG_RBX 0x00
+#define MARL_REG_RBP 0x08
+#define MARL_REG_R12 0x10
+#define MARL_REG_R13 0x18
+#define MARL_REG_R14 0x20
+#define MARL_REG_R15 0x28
+#define MARL_REG_RDI 0x30
+#define MARL_REG_RSI 0x38
+#define MARL_REG_RSP 0x40
+#define MARL_REG_RIP 0x48
+
+// MARL_ASM_SYMBOL(x) yields the name used for symbol x in the assembly
+// source: x with a leading underscore when __APPLE__ is defined, and x
+// unchanged otherwise.
+#if defined(__APPLE__)
+#define MARL_ASM_SYMBOL(x) _##x
+#else
+#define MARL_ASM_SYMBOL(x) x
+#endif
+
+#ifndef MARL_BUILD_ASM
+
+#include <stdint.h>
+
+// Register state saved and restored by marl_fiber_swap on x86-64.
+// The member order defines the byte offsets named by the MARL_REG_* macros,
+// so members must not be reordered, resized or removed without updating
+// those macros.
+struct marl_fiber_context {
+  // callee-saved registers
+  uintptr_t RBX;
+  uintptr_t RBP;
+  uintptr_t R12;
+  uintptr_t R13;
+  uintptr_t R14;
+  uintptr_t R15;
+
+  // parameter registers
+  // (the swap routine only loads these two; it does not store them)
+  uintptr_t RDI;
+  uintptr_t RSI;
+
+  // stack and instruction registers
+  uintptr_t RSP;
+  uintptr_t RIP;
+};
+
+#ifdef __cplusplus
+#include <cstddef>
+static_assert(offsetof(marl_fiber_context, RBX) == MARL_REG_RBX,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, RBP) == MARL_REG_RBP,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, R12) == MARL_REG_R12,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, R13) == MARL_REG_R13,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, R14) == MARL_REG_R14,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, R15) == MARL_REG_R15,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, RDI) == MARL_REG_RDI,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, RSI) == MARL_REG_RSI,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, RSP) == MARL_REG_RSP,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, RIP) == MARL_REG_RIP,
+              "Bad register offset");
+#endif  // __cplusplus
+
+#endif  // MARL_BUILD_ASM
--- a/3party/marl/src/osfiber_asm_x86.S
+++ b/3party/marl/src/osfiber_asm_x86.S
@ -0,0 +1,57 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if defined(__i386__)
+
+#define MARL_BUILD_ASM 1
+#include "osfiber_asm_x86.h"
+
+// void marl_fiber_swap(marl_fiber_context* from, const marl_fiber_context* to)
+// esp+4: from (context that receives the currently live register state)
+// esp+8: to   (context whose stored register state is loaded and jumped to)
+.text
+.global marl_fiber_swap
+.align 4
+marl_fiber_swap:
+    // Save context 'from': eax = from
+    movl        4(%esp), %eax
+
+    // Store callee-preserved registers into 'from'
+    movl        %ebx, MARL_REG_EBX(%eax)
+    movl        %ebp, MARL_REG_EBP(%eax)
+    movl        %esi, MARL_REG_ESI(%eax)
+    movl        %edi, MARL_REG_EDI(%eax)
+
+    movl        (%esp), %ecx             /* ecx = return address; call stores it on the stack before jumping */
+    movl        %ecx, MARL_REG_EIP(%eax) /* stored as the EIP of 'from' */
+    lea         4(%esp), %ecx            /* ecx = esp + 4, i.e. skip the pushed return address */
+    movl        %ecx, MARL_REG_ESP(%eax) /* stored as the ESP of 'from' */
+
+    // Load context 'to': ecx = to (read from the old stack before esp is replaced)
+    movl        8(%esp), %ecx
+
+    // Load callee-preserved registers from 'to'
+    movl        MARL_REG_EBX(%ecx), %ebx
+    movl        MARL_REG_EBP(%ecx), %ebp
+    movl        MARL_REG_ESI(%ecx), %esi
+    movl        MARL_REG_EDI(%ecx), %edi
+
+    // Load stack pointer stored in 'to'
+    movl        MARL_REG_ESP(%ecx), %esp
+
+    // Load instruction pointer (overwrites the 'to' pointer held in ecx), and jump
+    movl        MARL_REG_EIP(%ecx), %ecx
+    jmp         *%ecx
+
+#endif // defined(__i386__)
--- a/3party/marl/src/osfiber_asm_x86.h
+++ b/3party/marl/src/osfiber_asm_x86.h
@ -0,0 +1,55 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#define MARL_REG_EBX 0x00
+#define MARL_REG_EBP 0x04
+#define MARL_REG_ESI 0x08
+#define MARL_REG_EDI 0x0c
+#define MARL_REG_ESP 0x10
+#define MARL_REG_EIP 0x14
+
+#ifndef MARL_BUILD_ASM
+#include <stdint.h>
+
+// x86 (32-bit) register save area for one fiber. Assumes cdecl calling convention.
+// Registers EAX, ECX, and EDX are caller-saved (and have no field here), and the rest are callee-saved.
+struct marl_fiber_context {
+  // callee-saved registers (offsets MARL_REG_EBX .. MARL_REG_EDI)
+  uintptr_t EBX;
+  uintptr_t EBP;
+  uintptr_t ESI;
+  uintptr_t EDI;
+
+  // stack and instruction registers (offsets MARL_REG_ESP, MARL_REG_EIP)
+  uintptr_t ESP;
+  uintptr_t EIP;
+};
+
+#ifdef __cplusplus
+#include <cstddef>
+static_assert(offsetof(marl_fiber_context, EBX) == MARL_REG_EBX,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, EBP) == MARL_REG_EBP,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, ESI) == MARL_REG_ESI,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, EDI) == MARL_REG_EDI,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, ESP) == MARL_REG_ESP,
+              "Bad register offset");
+static_assert(offsetof(marl_fiber_context, EIP) == MARL_REG_EIP,
+              "Bad register offset");
+#endif  // __cplusplus
+
+#endif  // MARL_BUILD_ASM
--- a/3party/marl/src/osfiber_emscripten.cpp
+++ b/3party/marl/src/osfiber_emscripten.cpp
@ -0,0 +1,60 @@
+// Copyright 2023 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if defined(__EMSCRIPTEN__)
+
+#include "osfiber_emscripten.h"
+
+#include "marl/export.h"
+
+
+extern "C" {
+
+// Fiber entry shim: calls target, passing arg as its only argument.
+MARL_EXPORT
+void marl_fiber_trampoline(void (*target)(void*), void* arg) {
+  (*target)(arg);
+}
+
+// Initializes ctx->context from the currently running context, passing the
+// context's own asyncify_stack buffer (pointer and size) to emscripten.
+MARL_EXPORT
+void marl_main_fiber_init(marl_fiber_context* ctx) {
+  auto& asyncify = ctx->asyncify_stack;
+  emscripten_fiber_init_from_current_context(&ctx->context, asyncify.data(),
+                                             asyncify.size());
+}
+
+// Initializes ctx->context as a fiber that runs target(arg) on the given
+// stack. The asyncify buffer comes from ctx->asyncify_stack; stack and
+// stack_size are forwarded to emscripten_fiber_init() unchanged.
+MARL_EXPORT
+void marl_fiber_set_target(marl_fiber_context* ctx,
+                           void* stack,
+                           uint32_t stack_size,
+                           void (*target)(void*),
+                           void* arg) {
+  auto& asyncify = ctx->asyncify_stack;
+  emscripten_fiber_init(&ctx->context, target, arg, stack, stack_size,
+                        asyncify.data(), asyncify.size());
+}
+
+// Switches from the fiber described by 'from' to the one described by 'to'
+// by delegating to emscripten_fiber_swap().
+MARL_EXPORT
+extern void marl_fiber_swap(marl_fiber_context* from,
+                            const marl_fiber_context* to) {
+  emscripten_fiber_t* current = &from->context;
+  // 'to' is const in this interface, but emscripten_fiber_swap() is called
+  // with a non-const pointer, so constness is cast away here.
+  emscripten_fiber_t* next = const_cast<emscripten_fiber_t*>(&to->context);
+  emscripten_fiber_swap(current, next);
+}
+}
+#endif  // defined(__EMSCRIPTEN__)
--- a/3party/marl/src/osfiber_emscripten.h
+++ b/3party/marl/src/osfiber_emscripten.h
@ -0,0 +1,30 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MARL_BUILD_WASM
+
+#include <cstdint>
+#include <cstddef>
+#include <array>
+#include <emscripten.h>
+#include <emscripten/fiber.h>
+
+struct marl_fiber_context {  // per-fiber state for the emscripten fiber backend
+  // Size, in bytes, of each context's asyncify_stack buffer (1 MiB).
+  static constexpr size_t asyncify_stack_size = 1024 * 1024;
+  emscripten_fiber_t context;  // fiber object handed to the emscripten_fiber_* calls
+  std::array</*std::byte*/ char, asyncify_stack_size> asyncify_stack;  // buffer passed to emscripten_fiber_init*() as the asyncify stack
+};
+
+#endif  // MARL_BUILD_WASM
--- a/3party/marl/src/osfiber_loongarch64.c
+++ b/3party/marl/src/osfiber_loongarch64.c
@ -0,0 +1,39 @@
+// Copyright 2022 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if defined(__loongarch_lp64)
+
+#include "osfiber_asm_loongarch64.h"
+
+#include "marl/export.h"
+
+// Fiber entry shim: calls target, passing arg as its only argument.
+MARL_EXPORT
+void marl_fiber_trampoline(void (*target)(void*), void* arg) {
+  (*target)(arg);
+}
+
+// Fills ctx for a new fiber:
+//   ra = address of marl_fiber_trampoline
+//   a0 = target, a1 = arg (the trampoline's two parameters)
+//   sp = end of the stack buffer, rounded down to a 16-byte boundary
+MARL_EXPORT
+void marl_fiber_set_target(struct marl_fiber_context* ctx,
+                           void* stack,
+                           uint32_t stack_size,
+                           void (*target)(void*),
+                           void* arg) {
+  const uintptr_t stack_end = (uintptr_t)((uint8_t*)stack + stack_size);
+  ctx->sp = stack_end & ~(uintptr_t)15;
+  ctx->a1 = (uintptr_t)arg;
+  ctx->a0 = (uintptr_t)target;
+  ctx->ra = (uintptr_t)&marl_fiber_trampoline;
+}
+
+#endif // defined(__loongarch_lp64)
--- a/3party/marl/src/osfiber_mips64.c
+++ b/3party/marl/src/osfiber_mips64.c
@ -0,0 +1,39 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if defined(__mips__) && _MIPS_SIM == _ABI64
+
+#include "osfiber_asm_mips64.h"
+
+#include "marl/export.h"
+
+// Fiber entry shim: calls target, passing arg as its only argument.
+MARL_EXPORT
+void marl_fiber_trampoline(void (*target)(void*), void* arg) {
+  (*target)(arg);
+}
+
+// Fills ctx for a new fiber:
+//   ra = address of marl_fiber_trampoline
+//   a0 = target, a1 = arg (the trampoline's two parameters)
+//   sp = end of the stack buffer, rounded down to a 16-byte boundary
+MARL_EXPORT
+void marl_fiber_set_target(struct marl_fiber_context* ctx,
+                           void* stack,
+                           uint32_t stack_size,
+                           void (*target)(void*),
+                           void* arg) {
+  const uintptr_t stack_end = (uintptr_t)((uint8_t*)stack + stack_size);
+  ctx->sp = stack_end & ~(uintptr_t)15;
+  ctx->a1 = (uintptr_t)arg;
+  ctx->a0 = (uintptr_t)target;
+  ctx->ra = (uintptr_t)&marl_fiber_trampoline;
+}
+
+#endif  // defined(__mips__) && _MIPS_SIM == _ABI64
--- a/3party/marl/src/osfiber_ppc64.c
+++ b/3party/marl/src/osfiber_ppc64.c
@ -0,0 +1,62 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if defined(__powerpc64__)
+
+#include "osfiber_asm_ppc64.h"
+
+#include "marl/export.h"
+
+// Fiber entry shim: calls target, passing arg as its only argument.
+MARL_EXPORT
+void marl_fiber_trampoline(void (*target)(void*), void* arg) {
+  (*target)(arg);
+}
+
+MARL_EXPORT  /* Fills ctx (r1, lr, r2, r3, r4, r13) and the top of the stack buffer for a new fiber. */
+void marl_fiber_set_target(struct marl_fiber_context* ctx,
+                           void* stack,
+                           uint32_t stack_size,
+                           void (*target)(void*),
+                           void* arg) {
+  uintptr_t stack_top = (uintptr_t)((uint8_t*)(stack) + stack_size - sizeof(uintptr_t));  // address of the last pointer-sized slot in the buffer
+  if ((stack_top % 16) != 0) {  // round down to a 16-byte boundary
+    stack_top -= (stack_top % 16);
+  }
+
+  // Write a backchain and subtract a minimum stack frame size (32/48)
+  *(uintptr_t*)stack_top = 0;  // topmost backchain word is zero
+#if !defined(_CALL_ELF) || (_CALL_ELF != 2)  // _CALL_ELF is not 2: 48-byte frame
+  stack_top -= 48;
+  *(uintptr_t*)stack_top = stack_top + 48;  // new frame's first word holds the address of the frame above
+#else  // _CALL_ELF == 2: 32-byte frame
+  stack_top -= 32;
+  *(uintptr_t*)stack_top = stack_top + 32;  // new frame's first word holds the address of the frame above
+#endif
+
+  // Load registers
+  ctx->r1 = stack_top;  // r1 receives the address of the frame built above
+#if !defined(_CALL_ELF) || (_CALL_ELF != 2)
+  ctx->lr = ((const uintptr_t *)marl_fiber_trampoline)[0];  // word 0 read through the function symbol - presumably the entry address of a function descriptor; TODO confirm
+  ctx->r2 = ((const uintptr_t *)marl_fiber_trampoline)[1];  // word 1 read through the function symbol - presumably the descriptor's TOC pointer; TODO confirm
+#else
+  ctx->lr = (uintptr_t)marl_fiber_trampoline;  // here the symbol's address is used directly
+#endif
+  ctx->r3 = (uintptr_t)target;  // first argument of marl_fiber_trampoline
+  ctx->r4 = (uintptr_t)arg;  // second argument of marl_fiber_trampoline
+
+  // Thread pointer must be saved in r13
+  __asm__ volatile("mr %0, 13\n" : "=r"(ctx->r13));  // copies the live value of register 13 into ctx->r13
+}
+
+#endif  // defined(__powerpc64__)
--- a/3party/marl/src/osfiber_rv64.c
+++ b/3party/marl/src/osfiber_rv64.c
@ -0,0 +1,39 @@
+// Copyright 2021 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if defined(__riscv) && __riscv_xlen == 64
+
+#include "osfiber_asm_rv64.h"
+
+#include "marl/export.h"
+
+// Fiber entry shim: calls target, passing arg as its only argument.
+MARL_EXPORT
+void marl_fiber_trampoline(void (*target)(void*), void* arg) {
+  (*target)(arg);
+}
+
+// Fills ctx for a new fiber:
+//   ra = address of marl_fiber_trampoline
+//   a0 = target, a1 = arg (the trampoline's two parameters)
+//   sp = end of the stack buffer, rounded down to a 16-byte boundary
+MARL_EXPORT
+void marl_fiber_set_target(struct marl_fiber_context* ctx,
+                           void* stack,
+                           uint32_t stack_size,
+                           void (*target)(void*),
+                           void* arg) {
+  const uintptr_t stack_end = (uintptr_t)((uint8_t*)stack + stack_size);
+  ctx->sp = stack_end & ~(uintptr_t)15;
+  ctx->a1 = (uintptr_t)arg;
+  ctx->a0 = (uintptr_t)target;
+  ctx->ra = (uintptr_t)&marl_fiber_trampoline;
+}
+
+#endif  // defined(__riscv) && __riscv_xlen == 64
--- a/3party/marl/src/osfiber_test.cpp
+++ b/3party/marl/src/osfiber_test.cpp
@ -0,0 +1,71 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "osfiber.h"
+
+#include "marl_test.h"
+
+namespace {
+
+// A custom, small stack size for the fibers in these tests.
+// Note: Stack sizes less than 16KB may cause issues on some platforms.
+// See: https://github.com/google/marl/issues/201
+constexpr size_t fiberStackSize = 16 * 1024;
+
+}  // anonymous namespace
+
+// Chains main -> C -> B -> A -> main through switchTo() and checks, via the
+// letter each fiber appends, that they ran in exactly that order.
+TEST_F(WithoutBoundScheduler, OSFiber) {
+  std::string str;
+  auto main = marl::OSFiber::createFiberFromCurrentThread(allocator);
+  marl::Allocator::unique_ptr<marl::OSFiber> fiberA, fiberB, fiberC;
+
+  // Each fiber records its letter, then hands control to the next in line.
+  fiberC = marl::OSFiber::createFiber(allocator, fiberStackSize, [&] {
+    str.push_back('C');
+    fiberC->switchTo(fiberB.get());
+  });
+  fiberB = marl::OSFiber::createFiber(allocator, fiberStackSize, [&] {
+    str.push_back('B');
+    fiberB->switchTo(fiberA.get());
+  });
+  fiberA = marl::OSFiber::createFiber(allocator, fiberStackSize, [&] {
+    str.push_back('A');
+    fiberA->switchTo(main.get());
+  });
+
+  main->switchTo(fiberC.get());
+
+  ASSERT_EQ(str, "CBA");
+}
+
+// Checks that a local of a 16-byte-aligned type, declared inside a fiber's
+// function, ends up at a 16-byte-aligned address.
+TEST_F(WithoutBoundScheduler, StackAlignment) {
+  struct alignas(16) AlignTo16Bytes {
+    uint64_t a, b;
+  };
+
+  uintptr_t address = 0;
+  auto main = marl::OSFiber::createFiberFromCurrentThread(allocator);
+  marl::Allocator::unique_ptr<marl::OSFiber> fiber;
+  fiber = marl::OSFiber::createFiber(allocator, fiberStackSize, [&] {
+    // Record where the fiber placed the aligned local, then return to main.
+    AlignTo16Bytes on_fiber_stack;
+    address = reinterpret_cast<uintptr_t>(&on_fiber_stack);
+    fiber->switchTo(main.get());
+  });
+
+  main->switchTo(fiber.get());
+
+  ASSERT_TRUE((address & 15) == 0)
+      << "Stack variable had unaligned address: 0x" << std::hex << address;
+}
--- a/3party/marl/src/osfiber_ucontext.h
+++ b/3party/marl/src/osfiber_ucontext.h
@ -0,0 +1,140 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if !defined(_XOPEN_SOURCE)
+// This must come before other #includes, otherwise we'll end up with ucontext_t
+// definition mismatches, leading to memory corruption hilarity.
+#define _XOPEN_SOURCE
+#endif  //  !defined(_XOPEN_SOURCE)
+
+#include "marl/debug.h"
+#include "marl/memory.h"
+
+#include <functional>
+#include <memory>
+
+#include <ucontext.h>
+
+#if defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif  // defined(__clang__)
+
+namespace marl {
+
+class OSFiber {  // fiber built on getcontext() / makecontext() / swapcontext()
+ public:
+  inline OSFiber(Allocator*);  // stores the allocator that the destructor uses to free the stack
+  inline ~OSFiber();  // frees 'stack' when stack.ptr is non-null
+
+  // createFiberFromCurrentThread() returns a fiber created from the current
+  // thread.
+  static inline Allocator::unique_ptr<OSFiber> createFiberFromCurrentThread(
+      Allocator* allocator);
+
+  // createFiber() returns a new fiber with the given stack size that will
+  // call func when switched to. func() must end by switching back to another
+  // fiber, and must not return.
+  static inline Allocator::unique_ptr<OSFiber> createFiber(
+      Allocator* allocator,
+      size_t stackSize,
+      const std::function<void()>& func);
+
+  // switchTo() immediately switches execution to the given fiber.
+  // switchTo() must be called on the currently executing fiber.
+  inline void switchTo(OSFiber*);
+
+ private:
+  Allocator* allocator;  // used by ~OSFiber() to free 'stack'
+  ucontext_t context;  // captured with getcontext(); saved and restored by swapcontext() in switchTo()
+  std::function<void()> target;  // copy of the func given to createFiber(); called by the fiber's entry point
+  Allocation stack;  // stack memory allocated in createFiber(); released by ~OSFiber()
+};
+
+// Stores the allocator; the destructor uses it to free the fiber's stack.
+OSFiber::OSFiber(Allocator* allocator)
+    : allocator(allocator) {
+}
+
+// Returns the stack allocation to the allocator, if this fiber has one.
+OSFiber::~OSFiber() {
+  if (stack.ptr == nullptr) {
+    return;  // No stack was allocated for this fiber.
+  }
+  allocator->free(stack);
+}
+
+// Wraps the calling thread's current execution context, captured with
+// getcontext(), in a new OSFiber. No stack is allocated here; the 'stack'
+// member is left as default-constructed.
+Allocator::unique_ptr<OSFiber> OSFiber::createFiberFromCurrentThread(
+    Allocator* allocator) {
+  auto out = allocator->make_unique<OSFiber>(allocator);
+  out->context = {};
+  // Check the result the same way createFiber() does, instead of silently
+  // continuing with a context that was never captured.
+  auto res = getcontext(&out->context);
+  (void)res;
+  MARL_ASSERT(res == 0, "getcontext() returned %d", int(res));
+  return out;
+}
+
+// Creates a fiber with its own stack of stackSize bytes. When first switched
+// to, the fiber calls a copy of func. The stack is obtained from allocator
+// with 16-byte alignment and is freed by the OSFiber destructor.
+Allocator::unique_ptr<OSFiber> OSFiber::createFiber(
+    Allocator* allocator,
+    size_t stackSize,
+    const std::function<void()>& func) {
+  // The fiber entry point receives the OSFiber pointer as two int arguments
+  // (see the makecontext() call below); this union splits and rejoins it.
+  union Args {
+    OSFiber* self;
+    struct {
+      int a;
+      int b;
+    };
+  };
+  // Fail the build, rather than truncate the pointer at runtime, on any
+  // target where a pointer is wider than the two ints that carry it.
+  static_assert(sizeof(OSFiber*) <= 2 * sizeof(int),
+                "OSFiber* must fit in the two int arguments of makecontext()");
+
+  struct Target {
+    // Entry point handed to makecontext(): rebuilds the OSFiber pointer from
+    // its two halves and runs that fiber's target function.
+    static void Main(int a, int b) {
+      Args u;
+      u.a = a;
+      u.b = b;
+      u.self->target();
+    }
+  };
+
+  Allocation::Request request;
+  request.size = stackSize;
+  request.alignment = 16;
+  request.usage = Allocation::Usage::Stack;
+#if MARL_USE_FIBER_STACK_GUARDS
+  request.useGuards = true;
+#endif
+
+  auto out = allocator->make_unique<OSFiber>(allocator);
+  out->context = {};
+  out->stack = allocator->allocate(request);
+  out->target = func;
+
+  // getcontext() fills in the context that makecontext() then modifies.
+  auto res = getcontext(&out->context);
+  (void)res;
+  MARL_ASSERT(res == 0, "getcontext() returned %d", int(res));
+  out->context.uc_stack.ss_sp = out->stack.ptr;
+  out->context.uc_stack.ss_size = stackSize;
+  out->context.uc_link = nullptr;  // No successor context: func must not return.
+
+  // Zero-initialize so both int halves are defined even when the pointer is
+  // narrower than two ints.
+  Args args{};
+  args.self = out.get();
+  makecontext(&out->context, reinterpret_cast<void (*)()>(&Target::Main), 2,
+              args.a, args.b);
+
+  return out;
+}
+
+// Saves the running context into this fiber and resumes the given fiber,
+// asserting that swapcontext() reported success.
+void OSFiber::switchTo(OSFiber* fiber) {
+  ucontext_t* const next = &fiber->context;
+  const int res = swapcontext(&context, next);
+  (void)res;
+  MARL_ASSERT(res == 0, "swapcontext() returned %d", int(res));
+}
+
+}  // namespace marl
+
+#if defined(__clang__)
+#pragma clang diagnostic pop
+#endif  // defined(__clang__)
--- a/3party/marl/src/osfiber_windows.h
+++ b/3party/marl/src/osfiber_windows.h
@ -0,0 +1,100 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl/debug.h"
+#include "marl/memory.h"
+
+#include <functional>
+#include <memory>
+
+#define WIN32_LEAN_AND_MEAN 1
+#include <Windows.h>
+
+namespace marl {
+
+class OSFiber {  // fiber built on the Win32 fiber calls used below (CreateFiberEx, SwitchToFiber, ...)
+ public:
+  inline ~OSFiber();  // calls ConvertFiberToThread() or DeleteFiber(), depending on isFiberFromThread
+
+  // createFiberFromCurrentThread() returns a fiber created from the current
+  // thread.
+  static inline Allocator::unique_ptr<OSFiber> createFiberFromCurrentThread(
+      Allocator* allocator);
+
+  // createFiber() returns a new fiber with the given stack size that will
+  // call func when switched to. func() must end by switching back to another
+  // fiber, and must not return.
+  static inline Allocator::unique_ptr<OSFiber> createFiber(
+      Allocator* allocator,
+      size_t stackSize,
+      const std::function<void()>& func);
+
+  // switchTo() immediately switches execution to the given fiber.
+  // switchTo() must be called on the currently executing fiber.
+  inline void switchTo(OSFiber*);
+
+ private:
+  static inline void WINAPI run(void* self);  // start routine passed to CreateFiberEx(); self is the OSFiber*
+  LPVOID fiber = nullptr;  // value returned by ConvertThreadToFiberEx() or CreateFiberEx()
+  bool isFiberFromThread = false;  // set to true by createFiberFromCurrentThread()
+  std::function<void()> target;  // copy of func from createFiber(); swapped out and called by run()
+};
+
+// Releases the fiber: one created by createFiber() is deleted, one converted
+// from a thread is converted back. Does nothing if no fiber was created.
+OSFiber::~OSFiber() {
+  if (fiber == nullptr) {
+    return;
+  }
+  if (!isFiberFromThread) {
+    DeleteFiber(fiber);
+    return;
+  }
+  ConvertFiberToThread();
+}
+
+// Converts the calling thread into a fiber with ConvertThreadToFiberEx() and
+// wraps the result. The returned OSFiber is flagged so that its destructor
+// converts the fiber back into a thread instead of deleting it.
+Allocator::unique_ptr<OSFiber> OSFiber::createFiberFromCurrentThread(
+    Allocator* allocator) {
+  auto out = allocator->make_unique<OSFiber>();
+  out->isFiberFromThread = true;
+  out->fiber = ConvertThreadToFiberEx(nullptr, FIBER_FLAG_FLOAT_SWITCH);
+  MARL_ASSERT(out->fiber != nullptr,
+              "ConvertThreadToFiberEx() failed with error 0x%x",
+              int(GetLastError()));
+  return out;
+}
+
+// Creates a fiber with CreateFiberEx() whose start routine is OSFiber::run
+// and whose parameter is the new OSFiber. A copy of func is stored as the
+// function that run() will call.
+Allocator::unique_ptr<OSFiber> OSFiber::createFiber(
+    Allocator* allocator,
+    size_t stackSize,
+    const std::function<void()>& func) {
+  auto out = allocator->make_unique<OSFiber>();
+  // stackSize is rounded up to the system's allocation granularity (typically
+  // 64 KB).
+  const size_t commitSize = stackSize - 1;
+  const size_t reserveSize = stackSize;
+  out->fiber = CreateFiberEx(commitSize, reserveSize, FIBER_FLAG_FLOAT_SWITCH,
+                             &OSFiber::run, out.get());
+  out->target = func;
+  MARL_ASSERT(out->fiber != nullptr, "CreateFiberEx() failed with error 0x%x",
+              int(GetLastError()));
+  return out;
+}
+
+// Switches execution to the fiber wrapped by 'to'.
+void OSFiber::switchTo(OSFiber* to) {
+  LPVOID next = to->fiber;
+  SwitchToFiber(next);
+}
+
+// Fiber start routine. 'self' is the OSFiber passed to CreateFiberEx(). The
+// stored target is swapped into a local (leaving the member empty) and then
+// called.
+void WINAPI OSFiber::run(void* self) {
+  auto* owner = reinterpret_cast<OSFiber*>(self);
+  std::function<void()> entry;
+  entry.swap(owner->target);
+  entry();
+}
+
+}  // namespace marl
--- a/3party/marl/src/osfiber_x64.c
+++ b/3party/marl/src/osfiber_x64.c
@ -0,0 +1,45 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if defined(__x86_64__)
+
+#include "osfiber_asm_x64.h"
+
+#include "marl/export.h"
+#include "marl/sanitizers.h"
+
+// You can find an explanation of this code here:
+// https://github.com/google/marl/issues/199
+
+// Fiber entry shim: calls target, passing arg as its only argument.
+// The "function" sanitizer check is disabled for this function in
+// undefined-sanitizer builds.
+MARL_UNDEFINED_SANITIZER_ONLY(__attribute__((no_sanitize("function"))))
+MARL_EXPORT
+void marl_fiber_trampoline(void (*target)(void*), void* arg) {
+  (*target)(arg);
+}
+
+// Fills ctx for a new fiber: RIP is the address of marl_fiber_trampoline,
+// RDI and RSI carry target and arg, and RSP points three pointer-sized slots
+// below the end of the stack buffer.
+// NOTE(review): the zeroed slot is stack_end[-2] while RSP is &stack_end[-3];
+// the layout is explained in https://github.com/google/marl/issues/199 -
+// confirm there before changing either index.
+MARL_EXPORT
+void marl_fiber_set_target(struct marl_fiber_context* ctx,
+                           void* stack,
+                           uint32_t stack_size,
+                           void (*target)(void*),
+                           void* arg) {
+  uintptr_t* const stack_end = (uintptr_t*)((uint8_t*)stack + stack_size);
+  stack_end[-2] = 0;  // No return target.
+  ctx->RSP = (uintptr_t)(stack_end - 3);
+  ctx->RSI = (uintptr_t)arg;
+  ctx->RDI = (uintptr_t)target;
+  ctx->RIP = (uintptr_t)&marl_fiber_trampoline;
+}
+
+#endif  // defined(__x86_64__)
--- a/3party/marl/src/osfiber_x86.c
+++ b/3party/marl/src/osfiber_x86.c
@ -0,0 +1,47 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if defined(__i386__)
+
+#include "osfiber_asm_x86.h"
+
+#include "marl/export.h"
+
+// Fiber entry shim: calls target, passing arg as its only argument.
+MARL_EXPORT
+void marl_fiber_trampoline(void (*target)(void*), void* arg) {
+  (*target)(arg);
+}
+
+// Fills ctx and the top of the stack buffer for a new fiber that begins in
+// marl_fiber_trampoline. Slots written, relative to the end of the buffer:
+//   stack_end[-5]  return address slot (0: there is no return target)
+//   stack_end[-4]  target
+//   stack_end[-3]  arg
+MARL_EXPORT
+void marl_fiber_set_target(struct marl_fiber_context* ctx,
+                           void* stack,
+                           uint32_t stack_size,
+                           void (*target)(void*),
+                           void* arg) {
+  // The stack pointer must be 16-byte aligned at the point of a 'call', and
+  // 'call' itself pushes the 4-byte return address before jumping.
+  // marl_fiber_swap() enters the fiber with 'jmp' rather than 'call', so ESP
+  // is offset by those 4 bytes here to keep the stack 16-byte aligned when
+  // the callee pops the return target.
+  uintptr_t* const stack_end = (uintptr_t*)((uint8_t*)stack + stack_size);
+  stack_end[-5] = 0;  // No return target.
+  stack_end[-4] = (uintptr_t)target;
+  stack_end[-3] = (uintptr_t)arg;
+  ctx->ESP = (uintptr_t)(stack_end - 5);
+  ctx->EIP = (uintptr_t)&marl_fiber_trampoline;
+}
+
+#endif  // defined(__i386__)
--- a/3party/marl/src/parallelize_test.cpp
+++ b/3party/marl/src/parallelize_test.cpp
@ -0,0 +1,27 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl_test.h"
+
+#include "marl/parallelize.h"
+
+// Passes three functions to marl::parallelize() and checks that each one has
+// set its flag by the time the call returns.
+TEST_P(WithBoundScheduler, Parallelize) {
+  bool a{false};
+  bool b{false};
+  bool c{false};
+  marl::parallelize(
+      [&] { a = true; },
+      [&] { b = true; },
+      [&] { c = true; });
+  ASSERT_TRUE(a);
+  ASSERT_TRUE(b);
+  ASSERT_TRUE(c);
+}
--- a/3party/marl/src/pool_test.cpp
+++ b/3party/marl/src/pool_test.cpp
@ -0,0 +1,206 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl_test.h"
+
+#include "marl/memory.h"
+#include "marl/pool.h"
+#include "marl/waitgroup.h"
+
+// Smoke test: an UnboundedPool<int> can be constructed and destroyed.
+TEST_P(WithBoundScheduler, UnboundedPool_ConstructDestruct) {
+  using Pool = marl::UnboundedPool<int>;
+  Pool pool;
+}
+
+// Smoke test: a BoundedPool<int, 10> can be constructed and destroyed.
+TEST_P(WithBoundScheduler, BoundedPool_ConstructDestruct) {
+  using Pool = marl::BoundedPool<int, 10>;
+  Pool pool;
+}
+
+// A default-constructed UnboundedPool loan holds no item: get() is null.
+TEST_P(WithBoundScheduler, UnboundedPoolLoan_GetNull) {
+  using Loan = marl::UnboundedPool<int>::Loan;
+  Loan loan;
+  ASSERT_EQ(loan.get(), nullptr);
+}
+
+// A default-constructed BoundedPool loan holds no item: get() is null.
+TEST_P(WithBoundScheduler, BoundedPoolLoan_GetNull) {
+  using Loan = marl::BoundedPool<int, 10>::Loan;
+  Loan loan;
+  ASSERT_EQ(loan.get(), nullptr);
+}
+
+// Borrows from an UnboundedPool 100 times in a row; each loan is a temporary
+// that is dropped at the end of its statement.
+TEST_P(WithBoundScheduler, UnboundedPool_Borrow) {
+  marl::UnboundedPool<int> pool;
+  for (int remaining = 100; remaining > 0; --remaining) {
+    pool.borrow();
+  }
+}
+
+// Schedules 10000 tasks that each borrow from a copy of the same
+// UnboundedPool, then waits for all of them to signal the wait group.
+TEST_P(WithBoundScheduler, UnboundedPool_ConcurrentBorrow) {
+  marl::UnboundedPool<int> pool;
+  constexpr int iterations = 10000;
+  marl::WaitGroup wg(iterations);
+  for (int scheduled = 0; scheduled != iterations; ++scheduled) {
+    // pool and wg are captured by copy.
+    marl::schedule([=] {
+      pool.borrow();
+      wg.done();
+    });
+  }
+  wg.wait();
+}
+
+// Borrows from a BoundedPool<int, 100> 100 times in a row; each loan is a
+// temporary that is dropped at the end of its statement.
+TEST_P(WithBoundScheduler, BoundedPool_Borrow) {
+  marl::BoundedPool<int, 100> pool;
+  for (int remaining = 100; remaining > 0; --remaining) {
+    pool.borrow();
+  }
+}
+
+// Schedules 10000 tasks that each borrow from a copy of the same 10-item
+// BoundedPool, then waits for all of them to signal the wait group.
+TEST_P(WithBoundScheduler, BoundedPool_ConcurrentBorrow) {
+  marl::BoundedPool<int, 10> pool;
+  constexpr int iterations = 10000;
+  marl::WaitGroup wg(iterations);
+  for (int scheduled = 0; scheduled != iterations; ++scheduled) {
+    // pool and wg are captured by copy.
+    marl::schedule([=] {
+      pool.borrow();
+      wg.done();
+    });
+  }
+  wg.wait();
+}
+
+// Test helper that counts, in static counters, how many instances have been
+// constructed and destroyed. Tests call reset() before relying on the
+// counts; until then both counters hold -1.
+struct CtorDtorCounter {
+  CtorDtorCounter() { ++ctor_count; }
+  ~CtorDtorCounter() { ++dtor_count; }
+
+  // Sets both counters back to zero.
+  static void reset() {
+    dtor_count = 0;
+    ctor_count = 0;
+  }
+
+  static int ctor_count;
+  static int dtor_count;
+};
+
+int CtorDtorCounter::ctor_count = -1;
+int CtorDtorCounter::dtor_count = -1;
+
+// With PoolPolicy::Reconstruct, every borrow constructs one item and every
+// returned loan destroys one: the counters advance in lock-step with loans.
+TEST_P(WithBoundScheduler, UnboundedPool_PolicyReconstruct) {
+  CtorDtorCounter::reset();
+  marl::UnboundedPool<CtorDtorCounter, marl::PoolPolicy::Reconstruct> pool;
+  // Creating the pool constructs nothing.
+  ASSERT_EQ(CtorDtorCounter::ctor_count, 0);
+  ASSERT_EQ(CtorDtorCounter::dtor_count, 0);
+  {
+    auto first = pool.borrow();
+    ASSERT_EQ(CtorDtorCounter::ctor_count, 1);
+    ASSERT_EQ(CtorDtorCounter::dtor_count, 0);
+  }
+  // The first loan went out of scope: one destruction.
+  ASSERT_EQ(CtorDtorCounter::ctor_count, 1);
+  ASSERT_EQ(CtorDtorCounter::dtor_count, 1);
+  {
+    auto second = pool.borrow();
+    ASSERT_EQ(CtorDtorCounter::ctor_count, 2);
+    ASSERT_EQ(CtorDtorCounter::dtor_count, 1);
+  }
+  ASSERT_EQ(CtorDtorCounter::ctor_count, 2);
+  ASSERT_EQ(CtorDtorCounter::dtor_count, 2);
+}
+
+// With PoolPolicy::Reconstruct, every borrow constructs one item and every
+// returned loan destroys one: the counters advance in lock-step with loans.
+TEST_P(WithBoundScheduler, BoundedPool_PolicyReconstruct) {
+  CtorDtorCounter::reset();
+  marl::BoundedPool<CtorDtorCounter, 10, marl::PoolPolicy::Reconstruct> pool;
+  // Creating the pool constructs nothing.
+  ASSERT_EQ(CtorDtorCounter::ctor_count, 0);
+  ASSERT_EQ(CtorDtorCounter::dtor_count, 0);
+  {
+    auto first = pool.borrow();
+    ASSERT_EQ(CtorDtorCounter::ctor_count, 1);
+    ASSERT_EQ(CtorDtorCounter::dtor_count, 0);
+  }
+  // The first loan went out of scope: one destruction.
+  ASSERT_EQ(CtorDtorCounter::ctor_count, 1);
+  ASSERT_EQ(CtorDtorCounter::dtor_count, 1);
+  {
+    auto second = pool.borrow();
+    ASSERT_EQ(CtorDtorCounter::ctor_count, 2);
+    ASSERT_EQ(CtorDtorCounter::dtor_count, 1);
+  }
+  ASSERT_EQ(CtorDtorCounter::ctor_count, 2);
+  ASSERT_EQ(CtorDtorCounter::dtor_count, 2);
+}
+
+// With PoolPolicy::Preserve, items are constructed by the time of the first
+// borrow and nothing is destroyed while the pool is alive; once the pool is
+// gone, the construction and destruction counts match.
+TEST_P(WithBoundScheduler, UnboundedPool_PolicyPreserve) {
+  CtorDtorCounter::reset();
+  {
+    marl::UnboundedPool<CtorDtorCounter, marl::PoolPolicy::Preserve> pool;
+    int ctor_count;
+    {
+      auto first = pool.borrow();
+      ASSERT_NE(CtorDtorCounter::ctor_count, 0);
+      ASSERT_EQ(CtorDtorCounter::dtor_count, 0);
+      // Remember the count; later borrows must not construct any more.
+      ctor_count = CtorDtorCounter::ctor_count;
+    }
+    ASSERT_EQ(CtorDtorCounter::ctor_count, ctor_count);
+    ASSERT_EQ(CtorDtorCounter::dtor_count, 0);
+    {
+      auto second = pool.borrow();
+      ASSERT_EQ(CtorDtorCounter::ctor_count, ctor_count);
+      ASSERT_EQ(CtorDtorCounter::dtor_count, 0);
+    }
+    ASSERT_EQ(CtorDtorCounter::ctor_count, ctor_count);
+    ASSERT_EQ(CtorDtorCounter::dtor_count, 0);
+  }
+  ASSERT_EQ(CtorDtorCounter::ctor_count, CtorDtorCounter::dtor_count);
+}
+
+// With PoolPolicy::Preserve, items are constructed by the time of the first
+// borrow and nothing is destroyed while the pool is alive; once the pool is
+// gone, the construction and destruction counts match.
+TEST_P(WithBoundScheduler, BoundedPool_PolicyPreserve) {
+  CtorDtorCounter::reset();
+  {
+    marl::BoundedPool<CtorDtorCounter, 10, marl::PoolPolicy::Preserve> pool;
+    int ctor_count;
+    {
+      auto first = pool.borrow();
+      ASSERT_NE(CtorDtorCounter::ctor_count, 0);
+      ASSERT_EQ(CtorDtorCounter::dtor_count, 0);
+      // Remember the count; later borrows must not construct any more.
+      ctor_count = CtorDtorCounter::ctor_count;
+    }
+    ASSERT_EQ(CtorDtorCounter::ctor_count, ctor_count);
+    ASSERT_EQ(CtorDtorCounter::dtor_count, 0);
+    {
+      auto second = pool.borrow();
+      ASSERT_EQ(CtorDtorCounter::ctor_count, ctor_count);
+      ASSERT_EQ(CtorDtorCounter::dtor_count, 0);
+    }
+    ASSERT_EQ(CtorDtorCounter::ctor_count, ctor_count);
+    ASSERT_EQ(CtorDtorCounter::dtor_count, 0);
+  }
+  ASSERT_EQ(CtorDtorCounter::ctor_count, CtorDtorCounter::dtor_count);
+}
+
+struct alignas(64) StructWithAlignment {  // 64-byte-aligned test type used by the *_AlignedTypes tests
+  uint8_t i;  // member whose address the tests check for alignment
+  uint8_t padding[63];  // together with i, makes 64 bytes of members
+};
+
+// Every item borrowed from a BoundedPool must sit at an address that honors
+// the alignment of StructWithAlignment.
+TEST_P(WithBoundScheduler, BoundedPool_AlignedTypes) {
+  marl::BoundedPool<StructWithAlignment, 100> pool;
+  for (int round = 0; round != 100; ++round) {
+    auto loan = pool.borrow();
+    // Low address bits below the alignment must all be zero.
+    ASSERT_EQ(reinterpret_cast<uintptr_t>(&loan->i) &
+                  (alignof(StructWithAlignment) - 1),
+              0U);
+  }
+}
+
+// Every item borrowed from an UnboundedPool must sit at an address that
+// honors the alignment of StructWithAlignment.
+TEST_P(WithBoundScheduler, UnboundedPool_AlignedTypes) {
+  marl::UnboundedPool<StructWithAlignment> pool;
+  for (int round = 0; round != 100; ++round) {
+    auto loan = pool.borrow();
+    // Low address bits below the alignment must all be zero.
+    ASSERT_EQ(reinterpret_cast<uintptr_t>(&loan->i) &
+                  (alignof(StructWithAlignment) - 1),
+              0U);
+  }
+}
--- a/3party/marl/src/scheduler.cpp
+++ b/3party/marl/src/scheduler.cpp
@ -0,0 +1,763 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "osfiber.h"  // Must come first. See osfiber_ucontext.h.
+
+#include "marl/scheduler.h"
+
+#include "marl/debug.h"
+#include "marl/thread.h"
+#include "marl/trace.h"
+
+#if defined(_WIN32)
+#include <intrin.h>  // __nop()
+#endif
+
+// Enable to trace scheduler events.
+#define ENABLE_TRACE_EVENTS 0
+
+// Enable to print verbose debug logging.
+#define ENABLE_DEBUG_LOGGING 0
+
+#if ENABLE_TRACE_EVENTS
+#define TRACE(...) MARL_SCOPED_EVENT(__VA_ARGS__)
+#else
+#define TRACE(...)
+#endif
+
+#if ENABLE_DEBUG_LOGGING
+#define DBG_LOG(msg, ...) \
+  printf("%.3x " msg "\n", (int)threadID() & 0xfff, __VA_ARGS__)
+#else
+#define DBG_LOG(msg, ...)
+#endif
+
+#define ASSERT_FIBER_STATE(FIBER, STATE)                                   \
+  MARL_ASSERT(FIBER->state == STATE,                                       \
+              "fiber %d was in state %s, but expected %s", (int)FIBER->id, \
+              Fiber::toString(FIBER->state), Fiber::toString(STATE))
+
+namespace {
+
+#if ENABLE_DEBUG_LOGGING
+// threadID() hashes std::this_thread::get_id() into a uint64_t that identifies
+// the calling thread. It exists purely as a debugging aid (see DBG_LOG).
+inline uint64_t threadID() {
+  return std::hash<std::thread::id>()(std::this_thread::get_id());
+}
+#endif
+
+// nop() executes a single no-operation instruction: the __nop() intrinsic on
+// Windows, an inline-assembly "nop" everywhere else.
+inline void nop() {
+#if defined(_WIN32)
+  __nop();
+#else
+  __asm__ __volatile__("nop");
+#endif
+}
+
+// setConfigDefaults() returns a copy of cfgIn with a default worker-thread
+// affinity policy filled in when none was supplied.
+// The default is only applied when at least one worker thread is requested,
+// and is built with Policy::anyOf() over Affinity::all(), both using the
+// config's allocator.
+inline marl::Scheduler::Config setConfigDefaults(
+    const marl::Scheduler::Config& cfgIn) {
+  marl::Scheduler::Config cfg{cfgIn};
+  if (cfg.workerThread.count > 0 && !cfg.workerThread.affinityPolicy) {
+    cfg.workerThread.affinityPolicy = marl::Thread::Affinity::Policy::anyOf(
+        marl::Thread::Affinity::all(cfg.allocator), cfg.allocator);
+  }
+  return cfg;
+}
+
+}  // anonymous namespace
+
+namespace marl {
+
+////////////////////////////////////////////////////////////////////////////////
+// Scheduler
+////////////////////////////////////////////////////////////////////////////////
+MARL_INSTANTIATE_THREAD_LOCAL(Scheduler*, Scheduler::bound, nullptr);
+
+// get() returns the current value of Scheduler::bound (instantiated above via
+// MARL_INSTANTIATE_THREAD_LOCAL with an initial value of nullptr).
+Scheduler* Scheduler::get() {
+  return bound;
+}
+
+// setBound() sets Scheduler::bound, the value returned by Scheduler::get().
+// Passing nullptr clears it.
+void Scheduler::setBound(Scheduler* scheduler) {
+  bound = scheduler;
+}
+
+// bind() binds this scheduler to the calling thread and creates a
+// single-threaded worker for that thread.
+// Asserts that no scheduler is bound yet. The new worker is started and then
+// stored in singleThreadedWorkers.byTid, keyed by the calling thread's id,
+// all while singleThreadedWorkers.mutex is held.
+void Scheduler::bind() {
+  MARL_ASSERT(get() == nullptr, "Scheduler already bound");
+  setBound(this);
+  {
+    marl::lock lock(singleThreadedWorkers.mutex);
+    // Note: -1 is converted to the uint32_t id parameter of Worker's
+    // constructor.
+    auto worker = cfg.allocator->make_unique<Worker>(
+        this, Worker::Mode::SingleThreaded, -1);
+    worker->start();
+    auto tid = std::this_thread::get_id();
+    singleThreadedWorkers.byTid.emplace(tid, std::move(worker));
+  }
+}
+
+// unbind() stops the calling thread's worker, removes it from the bound
+// scheduler's singleThreadedWorkers.byTid map and finally clears the bound
+// scheduler.
+// All scheduler state is reached through get() rather than through `this`.
+// Asserts that a scheduler is bound, and that the map entry for the calling
+// thread exists and refers to the current worker.
+void Scheduler::unbind() {
+  MARL_ASSERT(get() != nullptr, "No scheduler bound");
+  auto worker = Worker::getCurrent();
+  worker->stop();
+  {
+    marl::lock lock(get()->singleThreadedWorkers.mutex);
+    auto tid = std::this_thread::get_id();
+    auto& workers = get()->singleThreadedWorkers.byTid;
+    auto it = workers.find(tid);
+    MARL_ASSERT(it != workers.end(), "singleThreadedWorker not found");
+    MARL_ASSERT(it->second.get() == worker, "worker is not bound?");
+    workers.erase(it);
+    if (workers.empty()) {
+      // Wakes ~Scheduler(), which waits on this condition until byTid is
+      // empty.
+      get()->singleThreadedWorkers.unbind.notify_one();
+    }
+  }
+  setBound(nullptr);
+}
+
+// Constructs the scheduler from config, with defaults applied by
+// setConfigDefaults(). All multi-threaded workers are created in a first pass
+// and only started in a second pass.
+// NOTE(review): i indexes spinningWorkers and workerThreads without a visible
+// bounds check; presumably both hold at least cfg.workerThread.count elements
+// - confirm against their declarations.
+Scheduler::Scheduler(const Config& config)
+    : cfg(setConfigDefaults(config)),
+      workerThreads{},
+      singleThreadedWorkers(config.allocator) {
+  for (int i = 0; i < cfg.workerThread.count; i++) {
+    // -1 marks the slot as holding no spinning worker (see enqueue()).
+    spinningWorkers[i] = -1;
+    workerThreads[i] =
+        cfg.allocator->create<Worker>(this, Worker::Mode::MultiThreaded, i);
+  }
+  for (int i = 0; i < cfg.workerThread.count; i++) {
+    workerThreads[i]->start();
+  }
+}
+
+// Destroys the scheduler. Blocks until every single-threaded worker has been
+// unbound, then stops all worker threads (in reverse index order) before
+// destroying them in a separate pass.
+Scheduler::~Scheduler() {
+  {
+    // Wait until all the single threaded workers have been unbound.
+    marl::lock lock(singleThreadedWorkers.mutex);
+    lock.wait(singleThreadedWorkers.unbind,
+              [this]() REQUIRES(singleThreadedWorkers.mutex) {
+                return singleThreadedWorkers.byTid.empty();
+              });
+  }
+
+  // Release all worker threads.
+  // This will wait for all in-flight tasks to complete before returning.
+  for (int i = cfg.workerThread.count - 1; i >= 0; i--) {
+    workerThreads[i]->stop();
+  }
+  for (int i = cfg.workerThread.count - 1; i >= 0; i--) {
+    cfg.allocator->destroy(workerThreads[i]);
+  }
+}
+
+// enqueue() queues task for execution.
+//  * Tasks flagged SameThread go to the calling thread's worker.
+//  * Otherwise, if worker threads are configured, a worker is picked (one that
+//    registered itself as spinning if available, else round-robin) and the
+//    loop retries with a new pick until a worker's lock is obtained with
+//    tryLock().
+//  * With no worker threads, the task goes to the calling thread's worker; if
+//    there is none, MARL_FATAL is raised.
+void Scheduler::enqueue(Task&& task) {
+  if (task.is(Task::Flags::SameThread)) {
+    // NOTE(review): no null check on getCurrent() here; presumably SameThread
+    // tasks are only enqueued from threads that have a worker - confirm.
+    Worker::getCurrent()->enqueue(std::move(task));
+    return;
+  }
+  if (cfg.workerThread.count > 0) {
+    while (true) {
+      // Prioritize workers that have recently started spinning.
+      auto i = --nextSpinningWorkerIdx % cfg.workerThread.count;
+      // Claim the slot: read the recorded worker id and reset it to -1.
+      auto idx = spinningWorkers[i].exchange(-1);
+      if (idx < 0) {
+        // If a spinning worker couldn't be found, round-robin the
+        // workers.
+        idx = nextEnqueueIndex++ % cfg.workerThread.count;
+      }
+
+      auto worker = workerThreads[idx];
+      if (worker->tryLock()) {
+        worker->enqueueAndUnlock(std::move(task));
+        return;
+      }
+    }
+  } else {
+    if (auto worker = Worker::getCurrent()) {
+      worker->enqueue(std::move(task));
+    } else {
+      MARL_FATAL(
+          "singleThreadedWorker not found. Did you forget to call "
+          "marl::Scheduler::bind()?");
+    }
+  }
+}
+
+// config() returns the configuration this scheduler was constructed with,
+// including the defaults applied by setConfigDefaults().
+const Scheduler::Config& Scheduler::config() const {
+  return cfg;
+}
+
+// stealWork() tries to steal a single task on behalf of thief.
+// The victim is the worker thread at index (from % worker thread count).
+// Returns true and stores the task in out on success; returns false when there
+// are no worker threads, when the victim is thief itself, or when the victim's
+// steal() fails.
+bool Scheduler::stealWork(Worker* thief, uint64_t from, Task& out) {
+  if (!(cfg.workerThread.count > 0)) {
+    return false;
+  }
+  auto victim = workerThreads[from % cfg.workerThread.count];
+  if (victim == thief) {
+    return false;
+  }
+  return victim->steal(out);
+}
+
+// onBeginSpinning() records workerId in the next slot of spinningWorkers, so
+// that enqueue() can prefer that worker.
+// The slot index is taken modulo cfg.workerThread.count; the only caller
+// visible in this file (Worker::waitForWork) calls this in MultiThreaded mode
+// only.
+void Scheduler::onBeginSpinning(int workerId) {
+  auto idx = nextSpinningWorkerIdx++ % cfg.workerThread.count;
+  spinningWorkers[idx] = workerId;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Scheduler::Config
+////////////////////////////////////////////////////////////////////////////////
+// allCores() returns a default Config whose worker thread count is set to
+// Thread::numLogicalCPUs().
+Scheduler::Config Scheduler::Config::allCores() {
+  return Config().setWorkerThreadCount(Thread::numLogicalCPUs());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Scheduler::Fiber
+////////////////////////////////////////////////////////////////////////////////
+// Constructs a Fiber with the given id, taking ownership of impl.
+// The fiber is associated with the worker returned by Worker::getCurrent() at
+// construction time; asserts that there is one.
+Scheduler::Fiber::Fiber(Allocator::unique_ptr<OSFiber>&& impl, uint32_t id)
+    : id(id), impl(std::move(impl)), worker(Worker::getCurrent()) {
+  MARL_ASSERT(worker != nullptr, "No Scheduler::Worker bound");
+}
+
+// current() returns the fiber currently executing on the calling thread's
+// worker, or nullptr when the calling thread has no worker.
+Scheduler::Fiber* Scheduler::Fiber::current() {
+  if (auto worker = Worker::getCurrent()) {
+    return worker->getCurrentFiber();
+  }
+  return nullptr;
+}
+
+// notify() hands this fiber to its worker's enqueue(Fiber*), which queues it
+// for resumption unless it is already running or queued.
+void Scheduler::Fiber::notify() {
+  worker->enqueue(this);
+}
+
+// wait() blocks this fiber, without a timeout, until pred returns true, by
+// forwarding to Worker::wait() with a null timeout.
+// Asserts that this fiber's worker is the calling thread's current worker.
+void Scheduler::Fiber::wait(marl::lock& lock, const Predicate& pred) {
+  MARL_ASSERT(worker == Worker::getCurrent(),
+              "Scheduler::Fiber::wait() must only be called on the currently "
+              "executing fiber");
+  worker->wait(lock, nullptr, pred);
+}
+
+// switchTo() switches execution from this fiber to `to` via the underlying
+// OSFiber. Switching to the fiber itself is a no-op.
+// Asserts that this fiber's worker is the calling thread's current worker.
+void Scheduler::Fiber::switchTo(Fiber* to) {
+  MARL_ASSERT(worker == Worker::getCurrent(),
+              "Scheduler::Fiber::switchTo() must only be called on the "
+              "currently executing fiber");
+  if (to != this) {
+    impl->switchTo(to->impl.get());
+  }
+}
+
+// create() allocates a new Fiber with the given id, backed by an OSFiber
+// created with OSFiber::createFiber(allocator, stackSize, func).
+Allocator::unique_ptr<Scheduler::Fiber> Scheduler::Fiber::create(
+    Allocator* allocator,
+    uint32_t id,
+    size_t stackSize,
+    const std::function<void()>& func) {
+  return allocator->make_unique<Fiber>(
+      OSFiber::createFiber(allocator, stackSize, func), id);
+}
+
+// createFromCurrentThread() allocates a new Fiber with the given id, backed by
+// an OSFiber created with OSFiber::createFiberFromCurrentThread(allocator).
+Allocator::unique_ptr<Scheduler::Fiber>
+Scheduler::Fiber::createFromCurrentThread(Allocator* allocator, uint32_t id) {
+  return allocator->make_unique<Fiber>(
+      OSFiber::createFiberFromCurrentThread(allocator), id);
+}
+
+// toString() returns the name of state as a string literal.
+// A value matching none of the cases trips an assertion and yields
+// "<unknown>".
+const char* Scheduler::Fiber::toString(State state) {
+  switch (state) {
+    case State::Idle:
+      return "Idle";
+    case State::Yielded:
+      return "Yielded";
+    case State::Queued:
+      return "Queued";
+    case State::Running:
+      return "Running";
+    case State::Waiting:
+      return "Waiting";
+  }
+  MARL_ASSERT(false, "bad fiber state");
+  return "<unknown>";
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Scheduler::WaitingFibers
+////////////////////////////////////////////////////////////////////////////////
+// Constructs an empty WaitingFibers whose two containers (timeouts and fibers)
+// both use allocator.
+Scheduler::WaitingFibers::WaitingFibers(Allocator* allocator)
+    : timeouts(allocator), fibers(allocator) {}
+
+// Returns true if at least one fiber is waiting.
+Scheduler::WaitingFibers::operator bool() const {
+  return !fibers.empty();
+}
+
+// take() removes and returns the fiber at the front of timeouts, provided its
+// timepoint is not later than timeout. Returns nullptr if no fiber is waiting
+// or the front entry's timepoint is still in the future relative to timeout.
+// Presumably timeouts is ordered by Timeout::operator<, making the front entry
+// the earliest - confirm against its declaration.
+Scheduler::Fiber* Scheduler::WaitingFibers::take(const TimePoint& timeout) {
+  if (!*this) {
+    return nullptr;
+  }
+  // Emptiness was checked on `fibers`; this relies on `timeouts` and `fibers`
+  // always holding the same set of fibers.
+  auto it = timeouts.begin();
+  if (timeout < it->timepoint) {
+    return nullptr;
+  }
+  auto fiber = it->fiber;
+  timeouts.erase(it);
+  auto deleted = fibers.erase(fiber) != 0;
+  (void)deleted;
+  MARL_ASSERT(deleted, "WaitingFibers::take() maps out of sync");
+  return fiber;
+}
+
+// next() returns the timepoint of the front entry of timeouts.
+// Must only be called when at least one fiber is waiting (asserted).
+Scheduler::TimePoint Scheduler::WaitingFibers::next() const {
+  MARL_ASSERT(*this,
+              "WaitingFibers::next() called when there are no waiting fibers");
+  return timeouts.begin()->timepoint;
+}
+
+// add() registers fiber as waiting until timeout by inserting it into both
+// containers. Asserts that fiber was not already registered.
+void Scheduler::WaitingFibers::add(const TimePoint& timeout, Fiber* fiber) {
+  timeouts.emplace(Timeout{timeout, fiber});
+  bool added = fibers.emplace(fiber, timeout).second;
+  (void)added;
+  MARL_ASSERT(added, "WaitingFibers::add() fiber already waiting");
+}
+
+// erase() removes fiber from both containers. Does nothing if fiber is not
+// registered as waiting.
+void Scheduler::WaitingFibers::erase(Fiber* fiber) {
+  auto entry = fibers.find(fiber);
+  if (entry == fibers.end()) {
+    return;
+  }
+  // The stored timepoint is needed to locate the matching Timeout entry.
+  auto timepoint = entry->second;
+  auto removed = timeouts.erase(Timeout{timepoint, fiber}) != 0;
+  (void)removed;
+  MARL_ASSERT(removed, "WaitingFibers::erase() maps out of sync");
+  fibers.erase(entry);
+}
+
+// contains() returns true if fiber is registered as waiting.
+bool Scheduler::WaitingFibers::contains(Fiber* fiber) const {
+  return fibers.count(fiber) != 0;
+}
+
+// Orders Timeouts by timepoint; entries with equal timepoints are ordered by
+// fiber pointer.
+bool Scheduler::WaitingFibers::Timeout::operator<(const Timeout& o) const {
+  if (timepoint == o.timepoint) {
+    return fiber < o.fiber;
+  }
+  return timepoint < o.timepoint;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Scheduler::Worker
+////////////////////////////////////////////////////////////////////////////////
+MARL_INSTANTIATE_THREAD_LOCAL(Scheduler::Worker*,
+                              Scheduler::Worker::current,
+                              nullptr);
+
+// Constructs a worker with the given id and mode, owned by scheduler.
+// The work queues and the idle-fiber container use the scheduler's allocator.
+// Nothing is started here; see start().
+Scheduler::Worker::Worker(Scheduler* scheduler, Mode mode, uint32_t id)
+    : id(id),
+      mode(mode),
+      scheduler(scheduler),
+      work(scheduler->cfg.allocator),
+      idleFibers(scheduler->cfg.allocator) {}
+
+// start() begins execution of the worker.
+//  * MultiThreaded: spawns a Thread with the affinity obtained from the
+//    scheduler's affinity policy. The thread names itself, runs the optional
+//    workerThread.initializer, binds the scheduler, creates the main fiber
+//    (id 0) from the thread and then calls run() with work.mutex held.
+//  * SingleThreaded: makes this the calling thread's current worker and
+//    creates the main fiber (id 0) from the calling thread.
+void Scheduler::Worker::start() {
+  switch (mode) {
+    case Mode::MultiThreaded: {
+      auto allocator = scheduler->cfg.allocator;
+      auto& affinityPolicy = scheduler->cfg.workerThread.affinityPolicy;
+      auto affinity = affinityPolicy->get(id, allocator);
+      // The thread body only touches members, so capture `this` explicitly.
+      // ([=] captured nothing but `this`, implicitly, which is deprecated in
+      // C++20.)
+      thread = Thread(std::move(affinity), [this] {
+        Thread::setName("Thread<%.2d>", int(id));
+
+        if (auto const& initFunc = scheduler->cfg.workerThread.initializer) {
+          initFunc(id);
+        }
+
+        Scheduler::setBound(scheduler);
+        Worker::current = this;
+        mainFiber = Fiber::createFromCurrentThread(scheduler->cfg.allocator, 0);
+        currentFiber = mainFiber.get();
+        {
+          marl::lock lock(work.mutex);
+          run();
+        }
+        mainFiber.reset();
+        Worker::current = nullptr;
+      });
+      break;
+    }
+    case Mode::SingleThreaded: {
+      Worker::current = this;
+      mainFiber = Fiber::createFromCurrentThread(scheduler->cfg.allocator, 0);
+      currentFiber = mainFiber.get();
+      break;
+    }
+    default:
+      MARL_ASSERT(false, "Unknown mode: %d", int(mode));
+  }
+}
+
+// stop() shuts the worker down.
+//  * MultiThreaded: enqueues a SameThread task that sets the shutdown flag,
+//    then joins the worker thread.
+//  * SingleThreaded: sets the shutdown flag and runs remaining work on the
+//    calling thread via runUntilShutdown() with work.mutex held, then clears
+//    the thread's current worker.
+void Scheduler::Worker::stop() {
+  switch (mode) {
+    case Mode::MultiThreaded: {
+      enqueue(Task([this] { shutdown = true; }, Task::Flags::SameThread));
+      thread.join();
+      break;
+    }
+    case Mode::SingleThreaded: {
+      marl::lock lock(work.mutex);
+      shutdown = true;
+      runUntilShutdown();
+      Worker::current = nullptr;
+      break;
+    }
+    default:
+      MARL_ASSERT(false, "Unknown mode: %d", int(mode));
+  }
+}
+
+// wait() suspends the current fiber (with work.mutex held for the duration of
+// suspend()) until it is resumed.
+// Returns true if timeout is null or has not yet been reached when the fiber
+// resumes, false otherwise.
+bool Scheduler::Worker::wait(const TimePoint* timeout) {
+  DBG_LOG("%d: WAIT(%d)", (int)id, (int)currentFiber->id);
+  {
+    marl::lock lock(work.mutex);
+    suspend(timeout);
+  }
+  return timeout == nullptr || std::chrono::system_clock::now() < *timeout;
+}
+
+// wait() blocks the current fiber until pred() returns true or, if timeout is
+// non-null, until the timeout is reached.
+// waitLock is unlocked before each suspend and re-locked afterwards, so it
+// must be held by the caller on entry; it is held again on return. pred() is
+// always evaluated with waitLock held.
+// Returns true if pred() returned true, false if the timeout was reached.
+bool Scheduler::Worker::wait(lock& waitLock,
+                             const TimePoint* timeout,
+                             const Predicate& pred) {
+  DBG_LOG("%d: WAIT(%d)", (int)id, (int)currentFiber->id);
+  while (!pred()) {
+    // Lock the work mutex to call suspend().
+    work.mutex.lock();
+
+    // Unlock the wait mutex with the work mutex lock held.
+    // Order is important here as we need to ensure that the fiber is not
+    // enqueued (via Fiber::notify()) between the waitLock.unlock() and fiber
+    // switch, otherwise the Fiber::notify() call may be ignored and the fiber
+    // is never woken.
+    waitLock.unlock_no_tsa();
+
+    // suspend the fiber.
+    suspend(timeout);
+
+    // Fiber resumed. We don't need the work mutex locked any more.
+    work.mutex.unlock();
+
+    // Re-lock to either return due to timeout, or call pred().
+    waitLock.lock_no_tsa();
+
+    // Check timeout.
+    if (timeout != nullptr && std::chrono::system_clock::now() >= *timeout) {
+      return false;
+    }
+
+    // Spurious wake up. Spin again.
+  }
+  return true;
+}
+
+// suspend() parks the current fiber and switches to another fiber until the
+// current one is resumed. Both callers in this file hold work.mutex when
+// calling.
+// With a timeout the fiber becomes Waiting and is registered in work.waiting;
+// without one it becomes Yielded. The worker then waits for work and switches
+// to, in order of preference: a queued fiber, an idle fiber, or a newly
+// created worker fiber. When execution returns here the fiber is marked
+// Running again.
+void Scheduler::Worker::suspend(
+    const std::chrono::system_clock::time_point* timeout) {
+  // Current fiber is yielding as it is blocked.
+  if (timeout != nullptr) {
+    changeFiberState(currentFiber, Fiber::State::Running,
+                     Fiber::State::Waiting);
+    work.waiting.add(*timeout, currentFiber);
+  } else {
+    changeFiberState(currentFiber, Fiber::State::Running,
+                     Fiber::State::Yielded);
+  }
+
+  // First wait until there's something else this worker can do.
+  waitForWork();
+
+  // Counted so that shutdown does not complete while this fiber is blocked
+  // (see runUntilShutdown()).
+  work.numBlockedFibers++;
+
+  if (!work.fibers.empty()) {
+    // There's another fiber that has become unblocked, resume that.
+    work.num--;
+    auto to = containers::take(work.fibers);
+    ASSERT_FIBER_STATE(to, Fiber::State::Queued);
+    switchToFiber(to);
+  } else if (!idleFibers.empty()) {
+    // There's an old fiber we can reuse, resume that.
+    auto to = containers::take(idleFibers);
+    ASSERT_FIBER_STATE(to, Fiber::State::Idle);
+    switchToFiber(to);
+  } else {
+    // Tasks to process and no existing fibers to resume.
+    // Spawn a new fiber.
+    switchToFiber(createWorkerFiber());
+  }
+
+  // Execution continues here once this fiber has been switched back to.
+  work.numBlockedFibers--;
+
+  setFiberState(currentFiber, Fiber::State::Running);
+}
+
+// tryLock() attempts to lock work.mutex without blocking and returns whether
+// the lock was obtained. On success the caller owns the lock (Scheduler::
+// enqueue() releases it through enqueueAndUnlock()).
+bool Scheduler::Worker::tryLock() {
+  return work.mutex.try_lock();
+}
+
+// enqueue() queues fiber to be resumed by this worker.
+// Fibers that are already Running or Queued are left untouched. A Waiting
+// fiber is first removed from work.waiting. The fiber is then appended to
+// work.fibers and marked Queued. If the worker was blocked in Work::wait()
+// (work.notifyAdded set), work.added is notified after the mutex has been
+// released.
+void Scheduler::Worker::enqueue(Fiber* fiber) {
+  bool notify = false;
+  {
+    marl::lock lock(work.mutex);
+    DBG_LOG("%d: ENQUEUE(%d %s)", (int)id, (int)fiber->id,
+            Fiber::toString(fiber->state));
+    switch (fiber->state) {
+      case Fiber::State::Running:
+      case Fiber::State::Queued:
+        return;  // Nothing to do here - task is already queued or running.
+      case Fiber::State::Waiting:
+        work.waiting.erase(fiber);
+        break;
+      case Fiber::State::Idle:
+      case Fiber::State::Yielded:
+        break;
+    }
+    notify = work.notifyAdded;
+    work.fibers.push_back(fiber);
+    MARL_ASSERT(!work.waiting.contains(fiber),
+                "fiber is unexpectedly in the waiting list");
+    setFiberState(fiber, Fiber::State::Queued);
+    work.num++;
+  }
+
+  if (notify) {
+    work.added.notify_one();
+  }
+}
+
+// enqueue() locks work.mutex and queues task via enqueueAndUnlock(), which
+// releases the mutex again.
+void Scheduler::Worker::enqueue(Task&& task) {
+  work.mutex.lock();
+  enqueueAndUnlock(std::move(task));
+}
+
+// enqueueAndUnlock() appends task to work.tasks and unlocks work.mutex, which
+// the caller must have locked beforehand (see tryLock() and enqueue(Task&&)).
+// If the worker was blocked in Work::wait() (work.notifyAdded set), work.added
+// is notified after the mutex has been released.
+void Scheduler::Worker::enqueueAndUnlock(Task&& task) {
+  auto notify = work.notifyAdded;
+  work.tasks.push_back(std::move(task));
+  work.num++;
+  work.mutex.unlock();
+  if (notify) {
+    work.added.notify_one();
+  }
+}
+
+// steal() tries, without blocking, to move the front task of this worker's
+// queue into out.
+// Returns false when work.num is zero, when work.mutex cannot be acquired with
+// try_lock(), when there are no tasks, or when the front task is flagged
+// SameThread. Returns true when a task was moved into out.
+bool Scheduler::Worker::steal(Task& out) {
+  // Cheap early-out before attempting to take the lock.
+  if (work.num.load() == 0) {
+    return false;
+  }
+  if (!work.mutex.try_lock()) {
+    return false;
+  }
+  if (work.tasks.empty() || work.tasks.front().is(Task::Flags::SameThread)) {
+    work.mutex.unlock();
+    return false;
+  }
+  work.num--;
+  out = containers::take(work.tasks);
+  work.mutex.unlock();
+  return true;
+}
+
+// run() is the body executed by a worker's fibers: by the main fiber of a
+// multi-threaded worker (see start()) and by fibers made in
+// createWorkerFiber(). It processes work until shutdown and then switches to
+// the main fiber.
+void Scheduler::Worker::run() {
+  if (mode == Mode::MultiThreaded) {
+    MARL_NAME_THREAD("Thread<%.2d> Fiber<%.2d>", int(id), Fiber::current()->id);
+    // This is the entry point for a multi-threaded worker.
+    // Start with a regular condition-variable wait for work. This avoids
+    // starting the thread with a spinForWorkAndLock().
+    work.wait([this]() REQUIRES(work.mutex) {
+      return work.num > 0 || work.waiting || shutdown;
+    });
+  }
+  ASSERT_FIBER_STATE(currentFiber, Fiber::State::Running);
+  runUntilShutdown();
+  switchToFiber(mainFiber.get());
+}
+
+// runUntilShutdown() alternates between waiting for work and processing it.
+// The loop only ends once shutdown is set, no work items remain (work.num is
+// zero) and no fibers are blocked (work.numBlockedFibers is zero).
+void Scheduler::Worker::runUntilShutdown() {
+  while (!shutdown || work.num > 0 || work.numBlockedFibers > 0U) {
+    waitForWork();
+    runUntilIdle();
+  }
+}
+
+// waitForWork() returns immediately if there is work queued. Otherwise a
+// multi-threaded worker first registers itself as spinning and spins for work
+// (with work.mutex released during the spin), and then any worker blocks in
+// work.wait() until work arrives or shutdown can proceed. Finally, fibers
+// whose timeouts have passed are moved to the work queue.
+void Scheduler::Worker::waitForWork() {
+  MARL_ASSERT(work.num == work.fibers.size() + work.tasks.size(),
+              "work.num out of sync");
+  if (work.num > 0) {
+    return;
+  }
+
+  if (mode == Mode::MultiThreaded) {
+    scheduler->onBeginSpinning(id);
+    // spinForWorkAndLock() is entered unlocked and re-locks before returning.
+    work.mutex.unlock();
+    spinForWorkAndLock();
+  }
+
+  work.wait([this]() REQUIRES(work.mutex) {
+    return work.num > 0 || (shutdown && work.numBlockedFibers == 0U);
+  });
+  if (work.waiting) {
+    enqueueFiberTimeouts();
+  }
+}
+
+// enqueueFiberTimeouts() moves every waiting fiber that work.waiting.take()
+// yields for the current time from Waiting to Queued and appends it to
+// work.fibers.
+void Scheduler::Worker::enqueueFiberTimeouts() {
+  auto now = std::chrono::system_clock::now();
+  while (auto fiber = work.waiting.take(now)) {
+    changeFiberState(fiber, Fiber::State::Waiting, Fiber::State::Queued);
+    DBG_LOG("%d: TIMEOUT(%d)", (int)id, (int)fiber->id);
+    work.fibers.push_back(fiber);
+    work.num++;
+  }
+}
+
+// changeFiberState() sets fiber's state to `to`, asserting that its state
+// beforehand was `from`.
+void Scheduler::Worker::changeFiberState(Fiber* fiber,
+                                         Fiber::State from,
+                                         Fiber::State to) const {
+  (void)from;  // Unused parameter when ENABLE_DEBUG_LOGGING is disabled.
+  DBG_LOG("%d: CHANGE_FIBER_STATE(%d %s -> %s)", (int)id, (int)fiber->id,
+          Fiber::toString(from), Fiber::toString(to));
+  ASSERT_FIBER_STATE(fiber, from);
+  fiber->state = to;
+}
+
+// setFiberState() sets fiber's state to `to` without checking the previous
+// state (compare changeFiberState()).
+void Scheduler::Worker::setFiberState(Fiber* fiber, Fiber::State to) const {
+  DBG_LOG("%d: SET_FIBER_STATE(%d %s -> %s)", (int)id, (int)fiber->id,
+          Fiber::toString(fiber->state), Fiber::toString(to));
+  fiber->state = to;
+}
+
+// spinForWorkAndLock() busy-waits for up to one millisecond for work to show
+// up, trying to steal a task from another worker between spin rounds.
+// It is entered with work.mutex unlocked (see waitForWork()) and every return
+// path locks work.mutex first, so the mutex is held on return whether or not
+// work was found.
+void Scheduler::Worker::spinForWorkAndLock() {
+  TRACE("SPIN");
+  Task stolen;
+
+  constexpr auto duration = std::chrono::milliseconds(1);
+  auto start = std::chrono::high_resolution_clock::now();
+  while (std::chrono::high_resolution_clock::now() - start < duration) {
+    for (int i = 0; i < 256; i++)  // Empirically picked magic number!
+    {
+      // clang-format off
+      nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop();
+      nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop();
+      nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop();
+      nop(); nop(); nop(); nop(); nop(); nop(); nop(); nop();
+      // clang-format on
+
+      // Unlocked peek first; re-check under the lock before committing.
+      if (work.num > 0) {
+        work.mutex.lock();
+        if (work.num > 0) {
+          return;
+        }
+        else {
+          // Our new task was stolen by another worker. Keep spinning.
+          work.mutex.unlock();
+        }
+      }
+    }
+
+    // rng() selects which worker to try to steal from (see
+    // Scheduler::stealWork()).
+    if (scheduler->stealWork(this, rng(), stolen)) {
+      work.mutex.lock();
+      work.tasks.emplace_back(std::move(stolen));
+      work.num++;
+      return;
+    }
+
+    std::this_thread::yield();
+  }
+  // Spin time elapsed without finding work.
+  work.mutex.lock();
+}
+
+// runUntilIdle() processes queued fibers and tasks until both queues are
+// empty. Queued fibers are resumed before tasks are run: the current fiber
+// parks itself as Idle, switches to the queued fiber and becomes Running again
+// when switched back to. Tasks are run one at a time with work.mutex released
+// while the task executes and is destructed.
+void Scheduler::Worker::runUntilIdle() {
+  ASSERT_FIBER_STATE(currentFiber, Fiber::State::Running);
+  MARL_ASSERT(work.num == work.fibers.size() + work.tasks.size(),
+              "work.num out of sync");
+  while (!work.fibers.empty() || !work.tasks.empty()) {
+    // Note: we cannot take and store on the stack more than a single fiber
+    // or task at a time, as the Fiber may yield and these items may get
+    // held on suspended fiber stack.
+
+    while (!work.fibers.empty()) {
+      work.num--;
+      auto fiber = containers::take(work.fibers);
+      // Sanity checks.
+      MARL_ASSERT(idleFibers.count(fiber) == 0, "dequeued fiber is idle");
+      MARL_ASSERT(fiber != currentFiber, "dequeued fiber is currently running");
+      ASSERT_FIBER_STATE(fiber, Fiber::State::Queued);
+
+      changeFiberState(currentFiber, Fiber::State::Running, Fiber::State::Idle);
+      auto added = idleFibers.emplace(currentFiber).second;
+      (void)added;
+      MARL_ASSERT(added, "fiber already idle");
+
+      switchToFiber(fiber);
+      // Back on this fiber: it was taken from idleFibers by whoever resumed
+      // it (see suspend()).
+      changeFiberState(currentFiber, Fiber::State::Idle, Fiber::State::Running);
+    }
+
+    if (!work.tasks.empty()) {
+      work.num--;
+      auto task = containers::take(work.tasks);
+      work.mutex.unlock();
+
+      // Run the task.
+      task();
+
+      // std::function<> can carry arguments with complex destructors.
+      // Ensure these are destructed outside of the lock.
+      task = Task();
+
+      work.mutex.lock();
+    }
+  }
+}
+
+// createWorkerFiber() creates a new fiber whose entry function calls run(),
+// using the scheduler's allocator and configured fiber stack size.
+// Ownership is kept in workerFibers; a non-owning pointer is returned.
+// Ids are workerFibers.size() + 1, i.e. they start at 1 (the main fiber is
+// created with id 0 in start()).
+Scheduler::Fiber* Scheduler::Worker::createWorkerFiber() {
+  auto fiberId = static_cast<uint32_t>(workerFibers.size() + 1);
+  DBG_LOG("%d: CREATE(%d)", (int)id, (int)fiberId);
+  auto fiber = Fiber::create(scheduler->cfg.allocator, fiberId,
+                             scheduler->cfg.fiberStackSize,
+                             [&]() REQUIRES(work.mutex) { run(); });
+  auto ptr = fiber.get();
+  workerFibers.emplace_back(std::move(fiber));
+  return ptr;
+}
+
+// switchToFiber() makes `to` the worker's current fiber and switches
+// execution to it.
+// Asserts that `to` is either the main fiber or not in idleFibers.
+void Scheduler::Worker::switchToFiber(Fiber* to) {
+  DBG_LOG("%d: SWITCH(%d -> %d)", (int)id, (int)currentFiber->id, (int)to->id);
+  MARL_ASSERT(to == mainFiber.get() || idleFibers.count(to) == 0,
+              "switching to idle fiber");
+  // currentFiber must be updated before the switch, as the statement after
+  // the switch only runs once this fiber is resumed.
+  auto from = currentFiber;
+  currentFiber = to;
+  from->switchTo(to);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Scheduler::Worker::Work
+////////////////////////////////////////////////////////////////////////////////
+// Constructs an empty Work whose task queue, fiber queue and waiting-fiber
+// set all use allocator.
+Scheduler::Worker::Work::Work(Allocator* allocator)
+    : tasks(allocator), fibers(allocator), waiting(allocator) {}
+
+// wait() blocks on the `added` condition variable with predicate f.
+// notifyAdded is set for the duration of the wait; the enqueue functions read
+// it to decide whether to call added.notify_one().
+// When fibers are waiting with timeouts, the wait is bounded by the next
+// timeout (waiting.next()); otherwise it is unbounded.
+// NOTE(review): presumably the caller must already hold `mutex`, as the
+// *_locked names suggest - confirm against marl's mutex wrapper.
+template <typename F>
+void Scheduler::Worker::Work::wait(F&& f) {
+  notifyAdded = true;
+  if (waiting) {
+    mutex.wait_until_locked(added, waiting.next(), f);
+  } else {
+    mutex.wait_locked(added, f);
+  }
+  notifyAdded = false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Scheduler::SingleThreadedWorkers
+////////////////////////////////////////////////////////////////////////////////
+// Constructs an empty SingleThreadedWorkers whose byTid map uses allocator.
+Scheduler::SingleThreadedWorkers::SingleThreadedWorkers(Allocator* allocator)
+    : byTid(allocator) {}
+
+}  // namespace marl
--- a/3party/marl/src/scheduler_bench.cpp
+++ b/3party/marl/src/scheduler_bench.cpp
@ -0,0 +1,101 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl_bench.h"
+
+#include "marl/waitgroup.h"
+
+#include "benchmark/benchmark.h"
+
+// Benchmarks scheduling numTasks empty tasks per iteration. The tasks are not
+// waited on inside the timed loop.
+BENCHMARK_DEFINE_F(Schedule, Empty)(benchmark::State& state) {
+  run(state, [&](int numTasks) {
+    for (auto _ : state) {
+      for (auto i = 0; i < numTasks; i++) {
+        marl::schedule([] {});
+      }
+    }
+  });
+}
+BENCHMARK_REGISTER_F(Schedule, Empty)->Apply(Schedule::args);
+
+// Benchmarks scheduling numTasks tasks that each call doSomeWork(), and
+// waiting for all of them to finish, once per iteration.
+BENCHMARK_DEFINE_F(Schedule, SomeWork)
+(benchmark::State& state) {
+  run(state, [&](int numTasks) {
+    for (auto _ : state) {
+      marl::WaitGroup wg;
+      wg.add(numTasks);
+      for (auto i = 0; i < numTasks; i++) {
+        marl::schedule([=] {
+          uint32_t value = doSomeWork(i);
+          // Keeps the result alive so the work is not optimized away.
+          benchmark::DoNotOptimize(value);
+          wg.done();
+        });
+      }
+      wg.wait();
+    }
+  });
+}
+BENCHMARK_REGISTER_F(Schedule, SomeWork)->Apply(Schedule::args);
+
+// Benchmarks a repeated fork-and-join pattern: tasks are scheduled in batches
+// of max(1, numThreads) and each batch is waited on before the next one is
+// scheduled. Every task chains 256 calls to doSomeWork().
+BENCHMARK_DEFINE_F(Schedule, MultipleForkAndJoin)(benchmark::State& state) {
+  run(state, [&](int numTasks) {
+    const int batchSize = std::max(1, Schedule::numThreads(state));
+    for (auto _ : state) {
+      marl::WaitGroup wg;
+      for (int i = 0; i < numTasks; i++) {
+        wg.add(1);
+        marl::schedule([=] {
+          // Give each task a significant amount of work so that concurrency matters.
+          // If any worker performs more than one task, it will affect the results.
+          int value = i;
+          for (int j = 0; j < 256; ++j) {
+            value = doSomeWork(value);
+          }
+          benchmark::DoNotOptimize(value);
+          wg.done();
+        });
+        // Wait for completion after every batch. This simulates the fork-and-join pattern.
+        if ((i + 1) % batchSize == 0) {
+          wg.wait();
+        }
+      }
+      // Joins any tasks left over from a final, partial batch.
+      wg.wait();
+    }
+  });
+}
+
+BENCHMARK_REGISTER_F(Schedule, MultipleForkAndJoin)->Apply(Schedule::args<512>);
+
+// Same workload as the SomeWork benchmark, but run with a scheduler config
+// whose worker-thread affinity policy is Policy::oneOf(Affinity::all()).
+BENCHMARK_DEFINE_F(Schedule, SomeWorkWorkerAffinityOneOf)
+(benchmark::State& state) {
+  marl::Scheduler::Config cfg;
+  cfg.setWorkerThreadAffinityPolicy(
+      marl::Thread::Affinity::Policy::oneOf(marl::Thread::Affinity::all()));
+  run(state, cfg, [&](int numTasks) {
+    for (auto _ : state) {
+      marl::WaitGroup wg;
+      wg.add(numTasks);
+      for (auto i = 0; i < numTasks; i++) {
+        marl::schedule([=] {
+          uint32_t value = doSomeWork(i);
+          benchmark::DoNotOptimize(value);
+          wg.done();
+        });
+      }
+      wg.wait();
+    }
+  });
+}
+BENCHMARK_REGISTER_F(Schedule, SomeWorkWorkerAffinityOneOf)
+    ->Apply(Schedule::args);
--- a/3party/marl/src/scheduler_test.cpp
+++ b/3party/marl/src/scheduler_test.cpp
@ -0,0 +1,227 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl_test.h"
+
+#include "marl/containers.h"
+#include "marl/defer.h"
+#include "marl/event.h"
+#include "marl/waitgroup.h"
+
+#include <atomic>
+
+// Smoke test: a default-configured scheduler can be constructed and
+// destructed without ever being bound.
+TEST_F(WithoutBoundScheduler, SchedulerConstructAndDestruct) {
+  auto scheduler = std::unique_ptr<marl::Scheduler>(
+      new marl::Scheduler(marl::Scheduler::Config()));
+}
+
+// Checks that Scheduler::get() returns the scheduler after bind() and nullptr
+// after unbind().
+TEST_F(WithoutBoundScheduler, SchedulerBindGetUnbind) {
+  auto scheduler = std::unique_ptr<marl::Scheduler>(
+      new marl::Scheduler(marl::Scheduler::Config()));
+  scheduler->bind();
+  auto got = marl::Scheduler::get();
+  ASSERT_EQ(scheduler.get(), got);
+  scheduler->unbind();
+  got = marl::Scheduler::get();
+  ASSERT_EQ(got, nullptr);
+}
+
+// Checks that Scheduler::config() reports the allocator and worker thread
+// count the scheduler was constructed with.
+TEST_F(WithoutBoundScheduler, CheckConfig) {
+  marl::Scheduler::Config cfg;
+  cfg.setAllocator(allocator).setWorkerThreadCount(10);
+
+  auto scheduler = std::unique_ptr<marl::Scheduler>(new marl::Scheduler(cfg));
+
+  auto gotCfg = scheduler->config();
+  ASSERT_EQ(gotCfg.allocator, allocator);
+  ASSERT_EQ(gotCfg.workerThread.count, 10);
+}
+
+// Schedules 1000 counter-incrementing tasks, then immediately unbinds and
+// deletes the scheduler. All 1000 tasks must have run by the time the
+// destructor returns.
+TEST_P(WithBoundScheduler, DestructWithPendingTasks) {
+  std::atomic<int> counter = {0};
+  for (int i = 0; i < 1000; i++) {
+    marl::schedule([&] { counter++; });
+  }
+
+  auto scheduler = marl::Scheduler::get();
+  scheduler->unbind();
+  delete scheduler;
+
+  // All scheduled tasks should be completed before the scheduler is destructed.
+  ASSERT_EQ(counter.load(), 1000);
+
+  // Rebind a new scheduler so WithBoundScheduler::TearDown() is happy.
+  (new marl::Scheduler(marl::Scheduler::Config()))->bind();
+}
+
+// Schedules 1000 tasks that block on a WaitGroup before incrementing a
+// counter, plus one task that releases the WaitGroup, then immediately unbinds
+// and deletes the scheduler. All 1000 blocked tasks must have completed by the
+// time the destructor returns.
+TEST_P(WithBoundScheduler, DestructWithPendingFibers) {
+  std::atomic<int> counter = {0};
+
+  marl::WaitGroup wg(1);
+  for (int i = 0; i < 1000; i++) {
+    marl::schedule([&] {
+      wg.wait();
+      counter++;
+    });
+  }
+
+  // Schedule a task to unblock all the tasks scheduled above.
+  // We assume that some of these tasks will not finish before the scheduler
+  // destruction logic kicks in.
+  marl::schedule([=] {
+    wg.done();  // Ready, steady, go...
+  });
+
+  auto scheduler = marl::Scheduler::get();
+  scheduler->unbind();
+  delete scheduler;
+
+  // All scheduled tasks should be completed before the scheduler is destructed.
+  ASSERT_EQ(counter.load(), 1000);
+
+  // Rebind a new scheduler so WithBoundScheduler::TearDown() is happy.
+  (new marl::Scheduler(marl::Scheduler::Config()))->bind();
+}
+
+// Checks that marl::schedule() forwards additional arguments (a string, an int
+// and a bool) to the scheduled callable.
+TEST_P(WithBoundScheduler, ScheduleWithArgs) {
+  std::string got;
+  marl::WaitGroup wg(1);
+  marl::schedule(
+      [wg, &got](std::string s, int i, bool b) {
+        got = "s: '" + s + "', i: " + std::to_string(i) +
+              ", b: " + (b ? "true" : "false");
+        wg.done();
+      },
+      "a string", 42, true);
+  // `got` is only read after the task has signalled completion.
+  wg.wait();
+  ASSERT_EQ(got, "s: 'a string', i: 42, b: true");
+}
+
+// Checks that a task which blocks on a WaitGroup continues on the same thread
+// it started on once it is unblocked.
+TEST_P(WithBoundScheduler, FibersResumeOnSameThread) {
+  marl::WaitGroup fence(1);
+  marl::WaitGroup wg(1000);
+  for (int i = 0; i < 1000; i++) {
+    marl::schedule([=] {
+      auto threadID = std::this_thread::get_id();
+      fence.wait();
+      // EXPECT rather than ASSERT: a failing ASSERT_* returns from the lambda
+      // immediately, which would skip wg.done() and leave wg.wait() below
+      // blocked forever instead of reporting the failure.
+      EXPECT_EQ(threadID, std::this_thread::get_id());
+      wg.done();
+    });
+  }
+  // just to try and get some tasks to yield.
+  std::this_thread::sleep_for(std::chrono::milliseconds(10));
+  fence.done();
+  wg.wait();
+}
+
+// Checks that code running on std::threads which bind the scheduler
+// themselves continues on the same thread after blocking on a WaitGroup.
+TEST_P(WithBoundScheduler, FibersResumeOnSameStdThread) {
+  auto scheduler = marl::Scheduler::get();
+
+  // on 32-bit OSs, excessive numbers of threads can run out of address space.
+  constexpr auto num_threads = sizeof(void*) > 4 ? 1000 : 100;
+
+  marl::WaitGroup fence(1);
+  marl::WaitGroup wg(num_threads);
+
+  marl::containers::vector<std::thread, 32> threads;
+  for (int i = 0; i < num_threads; i++) {
+    threads.emplace_back(std::thread([=] {
+      scheduler->bind();
+      defer(scheduler->unbind());
+
+      auto threadID = std::this_thread::get_id();
+      fence.wait();
+      // EXPECT rather than ASSERT: a failing ASSERT_* returns from the lambda
+      // immediately, which would skip wg.done() and leave wg.wait() below
+      // blocked forever instead of reporting the failure.
+      EXPECT_EQ(threadID, std::this_thread::get_id());
+      wg.done();
+    }));
+  }
+  // just to try and get some tasks to yield.
+  std::this_thread::sleep_for(std::chrono::milliseconds(10));
+  fence.done();
+  wg.wait();
+
+  for (auto& thread : threads) {
+    thread.join();
+  }
+}
+
+// With 8 worker threads configured, schedules 10000 tasks that record the id
+// of the thread they run on. Checks that at most 8 distinct threads were used
+// and that none of them is the test's own (bound) thread.
+TEST_F(WithoutBoundScheduler, TasksOnlyScheduledOnWorkerThreads) {
+  marl::Scheduler::Config cfg;
+  cfg.setWorkerThreadCount(8);
+
+  auto scheduler = std::unique_ptr<marl::Scheduler>(new marl::Scheduler(cfg));
+  scheduler->bind();
+  defer(scheduler->unbind());
+
+  // Guards `threads`, which is written from the scheduled tasks.
+  std::mutex mutex;
+  marl::containers::unordered_set<std::thread::id> threads(allocator);
+  marl::WaitGroup wg;
+  for (int i = 0; i < 10000; i++) {
+    wg.add(1);
+    marl::schedule([&mutex, &threads, wg] {
+      defer(wg.done());
+      std::unique_lock<std::mutex> lock(mutex);
+      threads.emplace(std::this_thread::get_id());
+    });
+  }
+  wg.wait();
+
+  ASSERT_LE(threads.size(), 8U);
+  ASSERT_EQ(threads.count(std::this_thread::get_id()), 0U);
+}
+
+// Test that a marl::Scheduler *with dedicated worker threads* can be used
+// without first binding to the scheduling thread.
+// Enqueues 100 pairs of tasks directly on an unbound scheduler with 8 worker
+// threads: one task waits on an event, the other checks the scheduler bound on
+// its thread and then signals the event.
+TEST_F(WithoutBoundScheduler, ScheduleMTWWithNoBind) {
+  marl::Scheduler::Config cfg;
+  cfg.setWorkerThreadCount(8);
+  auto scheduler = std::unique_ptr<marl::Scheduler>(new marl::Scheduler(cfg));
+
+  marl::WaitGroup wg;
+  for (int i = 0; i < 100; i++) {
+    wg.add(1);
+
+    marl::Event event;
+    scheduler->enqueue(marl::Task([event, wg] {
+      event.wait();  // Test that tasks can wait on other tasks.
+      wg.done();
+    }));
+
+    scheduler->enqueue(marl::Task([event, &scheduler] {
+      // Despite the main thread never binding the scheduler, the scheduler
+      // should be automatically bound to worker threads.
+      // EXPECT rather than ASSERT: a failing ASSERT_* returns from the lambda
+      // immediately, which would skip event.signal() and leave the waiting
+      // task (and wg.wait() below) blocked forever.
+      EXPECT_EQ(marl::Scheduler::get(), scheduler.get());
+
+      event.signal();
+    }));
+  }
+
+  // As the scheduler has not been bound to the main thread, the wait() call
+  // here will block **without** fiber yielding.
+  wg.wait();
+}
+
+// Test that a marl::Scheduler *without dedicated worker threads* cannot be used
+// without first binding to the scheduling thread.
+// Enqueues a task on a scheduler that has no worker threads and was never
+// bound. When MARL_DEBUG_ENABLED is set and death tests are available, the
+// enqueue is expected to die with the "forget to call bind" message; when
+// MARL_DEBUG_ENABLED is not set, the enqueue is performed and the task must
+// never run.
+TEST_F(WithoutBoundScheduler, ScheduleSTWWithNoBind) {
+  marl::Scheduler::Config cfg;
+  auto scheduler = std::unique_ptr<marl::Scheduler>(new marl::Scheduler(cfg));
+
+#if MARL_DEBUG_ENABLED && GTEST_HAS_DEATH_TEST
+  EXPECT_DEATH(scheduler->enqueue(marl::Task([] {})),
+               "Did you forget to call marl::Scheduler::bind");
+#elif !MARL_DEBUG_ENABLED
+  scheduler->enqueue(marl::Task([] { FAIL() << "Should not be called"; }));
+#endif
+}
--- a/3party/marl/src/thread.cpp
+++ b/3party/marl/src/thread.cpp
@ -0,0 +1,474 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl/thread.h"
+
+#include "marl/debug.h"
+#include "marl/defer.h"
+#include "marl/trace.h"
+
+#include <algorithm>  // std::sort
+
+#include <cstdarg>
+#include <cstdio>
+
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN 1
+#include <windows.h>
+#include <array>
+#include <cstdlib>  // mbstowcs
+#include <limits>   // std::numeric_limits
+#include <vector>
+#undef max
+#elif defined(__APPLE__)
+#include <mach/thread_act.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <thread>
+#elif defined(__FreeBSD__)
+#include <pthread.h>
+#include <pthread_np.h>
+#include <unistd.h>
+#include <thread>
+#else
+#include <pthread.h>
+#include <unistd.h>
+#include <thread>
+#endif
+
+namespace {
+
+// Hash functor for marl::Thread::Core, used by the unordered_set lookups in
+// this file. The hash value is simply the core's pthread index.
+struct CoreHasher {
+  inline uint64_t operator()(marl::Thread::Core core) const {
+    const uint64_t hash = core.pthread.index;
+    return hash;
+  }
+};
+
+}  // anonymous namespace
+
+namespace marl {
+
+#if defined(_WIN32)
+// Number of distinct values that Thread::Core::windows.index can hold.
+static constexpr size_t MaxCoreCount =
+    std::numeric_limits<decltype(Thread::Core::windows.index)>::max() + 1ULL;
+// Number of distinct values that Thread::Core::windows.group can hold.
+static constexpr size_t MaxGroupCount =
+    std::numeric_limits<decltype(Thread::Core::windows.group)>::max() + 1ULL;
+// Every bit position of a KAFFINITY mask must be representable as a
+// Thread::Core::windows.index value.
+static_assert(sizeof(KAFFINITY) * 8ULL <= MaxCoreCount,
+              "Thread::Core::windows.index is too small");
+
+namespace {
+// CHECK_WIN32 evaluates |expr| exactly once and asserts (via MARL_ASSERT)
+// that the result equals TRUE, reporting GetLastError() in the message.
+// The (void) cast keeps |res| from being flagged as unused.
+#define CHECK_WIN32(expr)                                    \
+  do {                                                       \
+    auto res = expr;                                         \
+    (void)res;                                               \
+    MARL_ASSERT(res == TRUE, #expr " failed with error: %d", \
+                (int)GetLastError());                        \
+  } while (false)
+
+// Description of a single Windows processor group.
+// NOTE: the field order matters - getProcessorGroups() aggregate-initializes
+// this struct as ProcessorGroup{count, affinity}.
+struct ProcessorGroup {
+  unsigned int count;  // number of logical processors in this group.
+  KAFFINITY affinity;  // affinity mask.
+};
+
+// Fixed-capacity list of processor groups.
+struct ProcessorGroups {
+  std::array<ProcessorGroup, MaxGroupCount> groups;  // storage for the groups.
+  size_t count;  // number of leading entries of |groups| that are populated.
+};
+
+// Returns the processor groups reported by GetLogicalProcessorInformationEx()
+// for RelationGroup. The query is performed once, on first call; the result
+// is kept in a function-local static and returned by reference afterwards.
+const ProcessorGroups& getProcessorGroups() {
+  static ProcessorGroups groups = [] {
+    ProcessorGroups out = {};
+    SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info[32] = {};
+    DWORD size = sizeof(info);
+    CHECK_WIN32(GetLogicalProcessorInformationEx(RelationGroup, info, &size));
+    DWORD count = size / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX);
+    for (DWORD i = 0; i < count; i++) {
+      if (info[i].Relationship == RelationGroup) {
+        auto groupCount = info[i].Group.ActiveGroupCount;
+        for (WORD groupIdx = 0; groupIdx < groupCount; groupIdx++) {
+          auto const& groupInfo = info[i].Group.GroupInfo[groupIdx];
+          // Check the capacity *before* storing, so that an overflow is
+          // reported instead of first writing past the end of out.groups.
+          MARL_ASSERT(out.count < MaxGroupCount, "Group index overflow");
+          out.groups[out.count++] = ProcessorGroup{
+              groupInfo.ActiveProcessorCount, groupInfo.ActiveProcessorMask};
+        }
+      }
+    }
+    return out;
+  }();
+  return groups;
+}
+}  // namespace
+#endif  // defined(_WIN32)
+
+////////////////////////////////////////////////////////////////////////////////
+// Thread::Affinity
+////////////////////////////////////////////////////////////////////////////////
+
+// Constructs an empty affinity whose core list is created with |allocator|.
+Thread::Affinity::Affinity(Allocator* allocator)
+    : cores(allocator) {}
+
+// Move constructor: takes over the core list of |other|.
+Thread::Affinity::Affinity(Affinity&& other)
+    : cores(std::move(other.cores)) {}
+
+// Move assignment: replaces this core list with the one moved from |other|.
+Thread::Affinity& Thread::Affinity::operator=(Affinity&& other) {
+  cores = std::move(other.cores);
+  return *this;
+}
+
+// Copies the cores of |other| into a new list created with |allocator|.
+Thread::Affinity::Affinity(const Affinity& other, Allocator* allocator)
+    : cores(other.cores, allocator) {}
+
+// Constructs an affinity holding the cores of |list|, in list order, using
+// |allocator| for the core storage.
+Thread::Affinity::Affinity(std::initializer_list<Core> list,
+                           Allocator* allocator)
+    : cores(allocator) {
+  // Reserve once up front so the appends below do not regrow the list.
+  cores.reserve(list.size());
+  for (auto it = list.begin(); it != list.end(); ++it) {
+    cores.push_back(*it);
+  }
+}
+
+// Constructs an affinity from a copy of |coreList|, using |allocator| for the
+// copied core storage.
+Thread::Affinity::Affinity(const containers::vector<Core, 32>& coreList,
+                           Allocator* allocator)
+    : cores(coreList, allocator) {}
+
+// Returns an affinity holding the cores available to the caller:
+//  * Windows: the cores whose bit is set in each processor group's mask.
+//  * Linux (non-Android / non-Bionic) and FreeBSD: the CPUs in the calling
+//    thread's affinity mask. If the mask cannot be queried, the returned
+//    affinity is empty.
+//  * Other platforms: compiles only when Affinity::supported is false, and
+//    returns an empty affinity.
+// |allocator| is used for the returned affinity's core list.
+Thread::Affinity Thread::Affinity::all(
+    Allocator* allocator /* = Allocator::Default */) {
+  Thread::Affinity affinity(allocator);
+
+#if defined(_WIN32)
+  const auto& groups = getProcessorGroups();
+  for (size_t groupIdx = 0; groupIdx < groups.count; groupIdx++) {
+    const auto& group = groups.groups[groupIdx];
+    Core core;
+    core.windows.group = static_cast<decltype(Core::windows.group)>(groupIdx);
+    for (unsigned int coreIdx = 0; coreIdx < group.count; coreIdx++) {
+      if ((group.affinity >> coreIdx) & 1) {
+        core.windows.index = static_cast<decltype(core.windows.index)>(coreIdx);
+        affinity.cores.emplace_back(std::move(core));
+      }
+    }
+  }
+#elif defined(__linux__) && !defined(__ANDROID__) && !defined(__BIONIC__)
+  auto thread = pthread_self();
+  cpu_set_t cpuset;
+  CPU_ZERO(&cpuset);
+  if (pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset) == 0) {
+    // The CPUs in the mask need not be the ids 0..CPU_COUNT-1 (for example
+    // when the process was started restricted to CPUs 2 and 3), so test each
+    // possible id rather than assuming the set is contiguous from zero.
+    for (int i = 0; i < CPU_SETSIZE; i++) {
+      if (CPU_ISSET(i, &cpuset)) {
+        Core core;
+        core.pthread.index = static_cast<uint16_t>(i);
+        affinity.cores.emplace_back(std::move(core));
+      }
+    }
+  }
+#elif defined(__FreeBSD__)
+  auto thread = pthread_self();
+  cpuset_t cpuset;
+  CPU_ZERO(&cpuset);
+  if (pthread_getaffinity_np(thread, sizeof(cpuset_t), &cpuset) == 0) {
+    // As on Linux: report the ids that are actually set in the mask.
+    for (int i = 0; i < CPU_SETSIZE; i++) {
+      if (CPU_ISSET(i, &cpuset)) {
+        Core core;
+        core.pthread.index = static_cast<uint16_t>(i);
+        affinity.cores.emplace_back(std::move(core));
+      }
+    }
+  }
+#else
+  static_assert(!supported,
+                "marl::Thread::Affinity::supported is true, but "
+                "Thread::Affinity::all() is not implemented for this platform");
+#endif
+
+  return affinity;
+}
+
+// Creates a policy whose get() returns a copy of all the cores of |affinity|.
+// On Windows the result is narrowed to the cores sharing the processor group
+// of the core selected by (threadId % core count).
+// The policy object itself is allocated with |allocator|.
+std::shared_ptr<Thread::Affinity::Policy> Thread::Affinity::Policy::anyOf(
+    Affinity&& affinity,
+    Allocator* allocator /* = Allocator::Default */) {
+  struct AnyOfPolicy : public Thread::Affinity::Policy {
+    Affinity affinity;
+    AnyOfPolicy(Affinity&& affinity) : affinity(std::move(affinity)) {}
+
+    Affinity get(uint32_t threadId, Allocator* allocator) const override {
+#if defined(_WIN32)
+      const auto numCores = affinity.count();
+      if (numCores != 0) {
+        // Keep only the cores that live in the selected core's group.
+        const auto wantedGroup = affinity[threadId % numCores].windows.group;
+        Affinity filtered(allocator);
+        filtered.cores.reserve(numCores);
+        for (auto core : affinity.cores) {
+          if (core.windows.group == wantedGroup) {
+            filtered.cores.push_back(core);
+          }
+        }
+        return filtered;
+      }
+#endif
+      // Non-Windows platforms, or an empty affinity: hand back a plain copy.
+      return Affinity(affinity, allocator);
+    }
+  };
+
+  return allocator->make_shared<AnyOfPolicy>(std::move(affinity));
+}
+
+// Creates a policy whose get() returns a single-core affinity: the core of
+// |affinity| at index (threadId % core count). If |affinity| is empty, get()
+// returns an empty copy instead.
+// The policy object itself is allocated with |allocator|.
+std::shared_ptr<Thread::Affinity::Policy> Thread::Affinity::Policy::oneOf(
+    Affinity&& affinity,
+    Allocator* allocator /* = Allocator::Default */) {
+  struct OneOfPolicy : public Thread::Affinity::Policy {
+    Affinity affinity;
+    OneOfPolicy(Affinity&& affinity) : affinity(std::move(affinity)) {}
+
+    Affinity get(uint32_t threadId, Allocator* allocator) const override {
+      const auto numCores = affinity.count();
+      if (numCores != 0) {
+        auto chosen = affinity[threadId % numCores];
+        return Affinity({chosen}, allocator);
+      }
+      // Nothing to choose from: return a copy of the (empty) affinity.
+      return Affinity(affinity, allocator);
+    }
+  };
+
+  return allocator->make_shared<OneOfPolicy>(std::move(affinity));
+}
+
+// Returns the number of cores held by this affinity.
+size_t Thread::Affinity::count() const {
+  return cores.size();
+}
+
+// Returns a copy of the core at position |index|. No bounds check is done
+// here; the index is passed straight to the underlying core list.
+Thread::Core Thread::Affinity::operator[](size_t index) const {
+  return cores[index];
+}
+
+// Appends every core of |other| that was not already held by this affinity
+// when the call started, then sorts the core list. Returns *this so that
+// calls can be chained.
+Thread::Affinity& Thread::Affinity::add(const Thread::Affinity& other) {
+  // Snapshot of the cores held before anything is appended.
+  containers::unordered_set<Core, CoreHasher> existing(cores.allocator);
+  for (auto held : cores) {
+    existing.emplace(held);
+  }
+
+  for (auto candidate : other.cores) {
+    if (existing.count(candidate) != 0) {
+      continue;  // already present before this call.
+    }
+    cores.push_back(candidate);
+  }
+
+  std::sort(cores.begin(), cores.end());
+  return *this;
+}
+
+// Removes from this affinity every core that is also held by |other|, then
+// sorts the remaining cores. Returns *this so that calls can be chained.
+Thread::Affinity& Thread::Affinity::remove(const Thread::Affinity& other) {
+  containers::unordered_set<Core, CoreHasher> set(cores.allocator);
+  for (auto core : other.cores) {
+    set.emplace(core);
+  }
+  for (size_t i = 0; i < cores.size();) {
+    if (set.count(cores[i]) != 0) {
+      // Swap-with-last removal. The element moved into slot i has not been
+      // examined yet, so i must NOT advance here - otherwise a core that
+      // should be removed can survive (e.g. removing {20, 40} from
+      // {10, 20, 30, 40} would leave 40 behind).
+      cores[i] = cores.back();
+      cores.resize(cores.size() - 1);
+    } else {
+      i++;
+    }
+  }
+  std::sort(cores.begin(), cores.end());
+  return *this;
+}
+
+#if defined(_WIN32)
+
+// Windows implementation of a thread: owns the function to run and the
+// handle of the thread created to run it.
+class Thread::Impl {
+ public:
+  // Stores |func| and creates the thread with CreateRemoteThreadEx(), passing
+  // |attributes| as the thread attribute list and |this| as the argument of
+  // the entry point run().
+  // NOTE: |func| is declared before |handle|, so it is initialized before the
+  // thread is created.
+  Impl(Func&& func, _PROC_THREAD_ATTRIBUTE_LIST* attributes)
+      : func(std::move(func)),
+        handle(CreateRemoteThreadEx(GetCurrentProcess(),
+                                    nullptr,
+                                    0,
+                                    &Impl::run,
+                                    this,
+                                    0,
+                                    attributes,
+                                    nullptr)) {}
+  // Closes the thread handle. This does not wait for the thread.
+  ~Impl() { CloseHandle(handle); }
+
+  // Not copyable or movable: the created thread holds a pointer to this
+  // object.
+  Impl(const Impl&) = delete;
+  Impl(Impl&&) = delete;
+  Impl& operator=(const Impl&) = delete;
+  Impl& operator=(Impl&&) = delete;
+
+  // Blocks, with no timeout, until the thread handle is signalled.
+  void Join() const { WaitForSingleObject(handle, INFINITE); }
+
+  // Thread entry point: |self| is the Impl passed at creation. Runs the
+  // stored function and returns 0.
+  static DWORD WINAPI run(void* self) {
+    reinterpret_cast<Impl*>(self)->func();
+    return 0;
+  }
+
+ private:
+  const Func func;
+  const HANDLE handle;
+};
+
+// Creates a thread that runs |func|. When |affinity| is non-empty the thread
+// is created with a group affinity built from its cores; all of the cores
+// must belong to the same processor group (asserted below).
+Thread::Thread(Affinity&& affinity, Func&& func) {
+  // First call with a null list: only queries the buffer size required for
+  // an attribute list with one attribute.
+  SIZE_T size = 0;
+  InitializeProcThreadAttributeList(nullptr, 1, 0, &size);
+  MARL_ASSERT(size > 0,
+              "InitializeProcThreadAttributeList() did not give a size");
+
+  // Second call: initializes the list inside |buffer|. The list is deleted
+  // again when this constructor returns (see defer()).
+  std::vector<uint8_t> buffer(size);
+  LPPROC_THREAD_ATTRIBUTE_LIST attributes =
+      reinterpret_cast<LPPROC_THREAD_ATTRIBUTE_LIST>(buffer.data());
+  CHECK_WIN32(InitializeProcThreadAttributeList(attributes, 1, 0, &size));
+  defer(DeleteProcThreadAttributeList(attributes));
+
+  GROUP_AFFINITY groupAffinity = {};
+
+  auto count = affinity.count();
+  if (count > 0) {
+    // The group is taken from the first core; the mask gets one bit per core.
+    groupAffinity.Group = affinity[0].windows.group;
+    for (size_t i = 0; i < count; i++) {
+      auto core = affinity[i];
+      MARL_ASSERT(groupAffinity.Group == core.windows.group,
+                  "Cannot create thread that uses multiple affinity groups");
+      groupAffinity.Mask |= (1ULL << core.windows.index);
+    }
+    CHECK_WIN32(UpdateProcThreadAttribute(
+        attributes, 0, PROC_THREAD_ATTRIBUTE_GROUP_AFFINITY, &groupAffinity,
+        sizeof(groupAffinity), nullptr, nullptr));
+  }
+
+  // Impl's constructor creates the thread using |attributes|.
+  impl = new Impl(std::move(func), attributes);
+}
+
+// Destroys the implementation object, if any. Impl's destructor closes the
+// thread handle; it does not join the thread.
+Thread::~Thread() {
+  delete impl;
+}
+
+// Blocks until the thread has finished. Asserts that this Thread still owns
+// an implementation object (it does not after being moved from).
+void Thread::join() {
+  MARL_ASSERT(impl != nullptr, "join() called on unjoinable thread");
+  impl->Join();
+}
+
+// Names the calling thread using a printf-style format string.
+// SetThreadDescription is looked up at runtime from kernelbase.dll; if the
+// function is not found, this call does nothing at all.
+void Thread::setName(const char* fmt, ...) {
+  using SetThreadDescriptionFn = HRESULT(WINAPI*)(HANDLE, PCWSTR);
+  // The lookup happens once; the result is cached in a function-local static.
+  static auto setThreadDescription = reinterpret_cast<SetThreadDescriptionFn>(
+      GetProcAddress(GetModuleHandleA("kernelbase.dll"),
+                     "SetThreadDescription"));
+  if (setThreadDescription == nullptr) {
+    return;
+  }
+
+  // Format the narrow name.
+  char name[1024];
+  va_list args;
+  va_start(args, fmt);
+  vsnprintf(name, sizeof(name), fmt, args);
+  va_end(args);
+
+  // Convert to wide characters for the Win32 call.
+  wchar_t wideName[1024];
+  mbstowcs(wideName, name, 1024);
+  setThreadDescription(GetCurrentThread(), wideName);
+  MARL_NAME_THREAD("%s", name);
+}
+
+// Returns the sum of the logical processor counts of all processor groups
+// reported by getProcessorGroups().
+unsigned int Thread::numLogicalCPUs() {
+  const auto& info = getProcessorGroups();
+  unsigned int total = 0;
+  for (size_t i = 0; i != info.count; ++i) {
+    total += info.groups[i].count;
+  }
+  return total;
+}
+
+#else
+
+// std::thread based implementation of a thread, used on non-Windows
+// platforms.
+class Thread::Impl {
+ public:
+  // Stores |affinity| and |f|, then starts the thread. The new thread first
+  // applies the affinity to itself and then calls the stored function.
+  Impl(Affinity&& affinity, Thread::Func&& f)
+      : affinity(std::move(affinity)),
+        func(std::move(f)),
+        thread([this] {
+          setAffinity();
+          func();
+        }) {}
+
+  // NOTE: |thread| must remain the last data member. Members are initialized
+  // in declaration order, and the thread body reads |affinity| and |func|.
+  Affinity affinity;
+  Func func;
+  std::thread thread;
+
+  // Applies |affinity| to the calling thread. Does nothing when the affinity
+  // holds no cores. The result of pthread_setaffinity_np() is not checked.
+  void setAffinity() {
+    const auto numCores = affinity.count();
+    if (numCores == 0) {
+      return;
+    }
+
+#if defined(__linux__) && !defined(__ANDROID__) && !defined(__BIONIC__)
+    cpu_set_t mask;
+    CPU_ZERO(&mask);
+    for (size_t i = 0; i < numCores; i++) {
+      CPU_SET(affinity[i].pthread.index, &mask);
+    }
+    pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &mask);
+#elif defined(__FreeBSD__)
+    cpuset_t mask;
+    CPU_ZERO(&mask);
+    for (size_t i = 0; i < numCores; i++) {
+      CPU_SET(affinity[i].pthread.index, &mask);
+    }
+    pthread_setaffinity_np(pthread_self(), sizeof(cpuset_t), &mask);
+#else
+    MARL_ASSERT(!marl::Thread::Affinity::supported,
+                "Attempting to use thread affinity on a unsupported platform");
+#endif
+  }
+};
+
+// Creates a thread that runs |func| with the given |affinity|. The thread is
+// started by the Impl constructor.
+Thread::Thread(Affinity&& affinity, Func&& func)
+    : impl(new Thread::Impl(std::move(affinity), std::move(func))) {}
+
+// Asserts that the implementation object has already been released, which
+// join() does. The destructor itself frees nothing.
+Thread::~Thread() {
+  MARL_ASSERT(!impl, "Thread::join() was not called before destruction");
+}
+
+// Blocks until the thread has finished, then frees the implementation
+// object. After this call the Thread no longer owns a thread.
+void Thread::join() {
+  // Same check as the Windows implementation: joining a Thread that has
+  // already been joined or moved from is a usage error, and without this
+  // assert it would be a silent null dereference.
+  MARL_ASSERT(impl != nullptr, "join() called on unjoinable thread");
+  impl->thread.join();
+  delete impl;
+  impl = nullptr;
+}
+
+// Names the calling thread using a printf-style format string. The formatted
+// name is limited to the size of the local 1024-byte buffer.
+void Thread::setName(const char* fmt, ...) {
+  char name[1024];
+
+  va_list args;
+  va_start(args, fmt);
+  vsnprintf(name, sizeof(name), fmt, args);
+  va_end(args);
+
+  // The naming call differs per platform; on Fuchsia and Emscripten no
+  // pthread naming call is made.
+#if defined(__APPLE__)
+  pthread_setname_np(name);
+#elif defined(__FreeBSD__)
+  pthread_set_name_np(pthread_self(), name);
+#elif !defined(__Fuchsia__) && !defined(__EMSCRIPTEN__)
+  pthread_setname_np(pthread_self(), name);
+#endif
+
+  MARL_NAME_THREAD("%s", name);
+}
+
+// Returns the value of sysconf(_SC_NPROCESSORS_ONLN) converted to unsigned.
+// The sysconf() result is not checked for errors before the conversion.
+unsigned int Thread::numLogicalCPUs() {
+  const auto online = sysconf(_SC_NPROCESSORS_ONLN);
+  return static_cast<unsigned int>(online);
+}
+
+#endif  // OS
+
+// Move constructor: takes over the implementation object of |rhs| and leaves
+// |rhs| without one.
+Thread::Thread(Thread&& rhs) : impl(rhs.impl) {
+  rhs.impl = nullptr;
+}
+
+// Move assignment: deletes the implementation object currently held (if
+// any), takes over the one of |rhs| and leaves |rhs| without one.
+// Assigning a Thread to itself is a no-op.
+Thread& Thread::operator=(Thread&& rhs) {
+  // Guard against self-assignment: without it the live Impl would be deleted
+  // and this Thread would end up holding nothing.
+  if (this != &rhs) {
+    delete impl;  // deleting a null pointer is harmless.
+    impl = rhs.impl;
+    rhs.impl = nullptr;
+  }
+  return *this;
+}
+
+}  // namespace marl
--- a/3party/marl/src/thread_test.cpp
+++ b/3party/marl/src/thread_test.cpp
@ -0,0 +1,137 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl_test.h"
+
+#include "marl/thread.h"
+
+namespace {
+
+// Test helper: builds a Core whose pthread index is |idx| (narrowed to
+// uint16_t).
+marl::Thread::Core core(int idx) {
+  marl::Thread::Core result;
+  result.pthread.index = static_cast<uint16_t>(idx);
+  return result;
+}
+
+}  // anonymous namespace
+
+// count() reports the number of cores the affinity was constructed with.
+TEST_F(WithoutBoundScheduler, ThreadAffinityCount) {
+  marl::Thread::Affinity affinity(
+      {core(10), core(20), core(30), core(40)}, allocator);
+
+  EXPECT_EQ(affinity.count(), 4U);
+}
+
+// add() merges in new cores, keeps the result sorted and can be chained.
+TEST_F(WithoutBoundScheduler, ThreadAdd) {
+  marl::Thread::Affinity affinity(
+      {core(10), core(20), core(30), core(40)}, allocator);
+
+  // The first batch is deliberately given out of order.
+  const marl::Thread::Affinity firstBatch({core(25), core(15)}, allocator);
+  const marl::Thread::Affinity secondBatch({core(35)}, allocator);
+  affinity.add(firstBatch).add(secondBatch);
+
+  EXPECT_EQ(affinity.count(), 7U);
+  EXPECT_EQ(affinity[0], core(10));
+  EXPECT_EQ(affinity[1], core(15));
+  EXPECT_EQ(affinity[2], core(20));
+  EXPECT_EQ(affinity[3], core(25));
+  EXPECT_EQ(affinity[4], core(30));
+  EXPECT_EQ(affinity[5], core(35));
+  EXPECT_EQ(affinity[6], core(40));
+}
+
+// remove() drops matching cores, ignores cores that are not present, keeps
+// the result sorted and can be chained.
+TEST_F(WithoutBoundScheduler, ThreadRemove) {
+  marl::Thread::Affinity affinity(
+      {core(10), core(20), core(30), core(40)}, allocator);
+
+  // core(25) is not part of the affinity; removing it must be harmless.
+  const marl::Thread::Affinity firstBatch({core(25), core(20)}, allocator);
+  const marl::Thread::Affinity secondBatch({core(40)}, allocator);
+  affinity.remove(firstBatch).remove(secondBatch);
+
+  EXPECT_EQ(affinity.count(), 2U);
+  EXPECT_EQ(affinity[0], core(10));
+  EXPECT_EQ(affinity[1], core(30));
+}
+
+// Affinity::all() yields at least one core where affinity is supported, and
+// no cores where it is not.
+TEST_F(WithoutBoundScheduler, ThreadAffinityAllCountNonzero) {
+  auto affinity = marl::Thread::Affinity::all(allocator);
+  if (!marl::Thread::Affinity::supported) {
+    EXPECT_EQ(affinity.count(), 0U);
+  } else {
+    EXPECT_NE(affinity.count(), 0U);
+  }
+}
+
+// An affinity built from a core vector holds the same cores in the same
+// order.
+TEST_F(WithoutBoundScheduler, ThreadAffinityFromVector) {
+  marl::containers::vector<marl::Thread::Core, 32> cores(allocator);
+  for (int idx = 10; idx <= 40; idx += 10) {
+    cores.push_back(core(idx));
+  }
+
+  marl::Thread::Affinity affinity(cores, allocator);
+
+  EXPECT_EQ(affinity.count(), cores.size());
+  EXPECT_EQ(affinity[0], core(10));
+  EXPECT_EQ(affinity[1], core(20));
+  EXPECT_EQ(affinity[2], core(30));
+  EXPECT_EQ(affinity[3], core(40));
+}
+
+// The oneOf policy hands thread N the single core at position N.
+TEST_F(WithoutBoundScheduler, ThreadAffinityPolicyOneOf) {
+  marl::Thread::Affinity all(
+      {core(10), core(20), core(30), core(40)}, allocator);
+
+  auto policy =
+      marl::Thread::Affinity::Policy::oneOf(std::move(all), allocator);
+
+  {
+    auto picked = policy->get(0, allocator);
+    EXPECT_EQ(picked.count(), 1U);
+    EXPECT_EQ(picked[0].pthread.index, 10);
+  }
+  {
+    auto picked = policy->get(1, allocator);
+    EXPECT_EQ(picked.count(), 1U);
+    EXPECT_EQ(picked[0].pthread.index, 20);
+  }
+  {
+    auto picked = policy->get(2, allocator);
+    EXPECT_EQ(picked.count(), 1U);
+    EXPECT_EQ(picked[0].pthread.index, 30);
+  }
+  {
+    auto picked = policy->get(3, allocator);
+    EXPECT_EQ(picked.count(), 1U);
+    EXPECT_EQ(picked[0].pthread.index, 40);
+  }
+}
--- a/3party/marl/src/ticket_bench.cpp
+++ b/3party/marl/src/ticket_bench.cpp
@ -0,0 +1,39 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl_bench.h"
+
+#include "marl/defer.h"
+#include "marl/scheduler.h"
+#include "marl/thread.h"
+#include "marl/ticket.h"
+
+#include "benchmark/benchmark.h"
+
+// Benchmark: per iteration, schedules |numTasks| tasks that each wait on a
+// ticket taken from one queue and then mark it done; the iteration ends by
+// waiting on one final ticket from the same queue.
+BENCHMARK_DEFINE_F(Schedule, Ticket)(benchmark::State& state) {
+  run(state, [&](int numTasks) {
+    for (auto _ : state) {
+      marl::Ticket::Queue tickets;
+      for (int task = 0; task < numTasks; task++) {
+        auto mine = tickets.take();
+        marl::schedule([mine] {
+          mine.wait();
+          mine.done();
+        });
+      }
+      // Taken after all task tickets.
+      tickets.take().wait();
+    }
+  });
+}
+BENCHMARK_REGISTER_F(Schedule, Ticket)->Apply(Schedule::args<512>);
--- a/3party/marl/src/ticket_test.cpp
+++ b/3party/marl/src/ticket_test.cpp
@ -0,0 +1,40 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl_test.h"
+
+#include "marl/ticket.h"
+
+// Tasks gated by tickets from one queue must run in the order in which
+// their tickets were taken.
+TEST_P(WithBoundScheduler, Ticket) {
+  marl::Ticket::Queue queue;
+
+  constexpr int kNumTickets = 1000;
+  std::atomic<int> cursor{0};     // next free slot of |order|.
+  int order[kNumTickets] = {};    // order[k] = index of the k-th task to run.
+
+  for (int i = 0; i < kNumTickets; i++) {
+    auto ticket = queue.take();
+    marl::schedule([ticket, i, &order, &cursor] {
+      ticket.wait();
+      order[cursor++] = i;
+      ticket.done();
+    });
+  }
+
+  // This ticket is taken after all the task tickets.
+  queue.take().wait();
+
+  for (int i = 0; i < kNumTickets; i++) {
+    ASSERT_EQ(order[i], i);
+  }
+}
--- a/3party/marl/src/trace.cpp
+++ b/3party/marl/src/trace.cpp
@ -0,0 +1,246 @@
+// Copyright 2019 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// The Trace API produces a trace event file that can be consumed with Chrome's
+// about:tracing viewer.
+// Documentation can be found at:
+//   https://www.chromium.org/developers/how-tos/trace-event-profiling-tool
+//   https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/edit
+
+#include "marl/trace.h"
+
+#include "marl/defer.h"
+#include "marl/scheduler.h"
+#include "marl/thread.h"
+
+#if MARL_TRACE_ENABLED
+
+#include <atomic>
+#include <fstream>
+
+namespace {
+
+// Chrome traces can choke or error on very large trace files.
+// Limit the number of events created to this number.
+static constexpr int MaxEvents = 100000;
+
+// Combines a thread id and a fiber id into the single id written to the
+// trace as "tid": threadID * 31 + fiberID, computed in 64 bits.
+uint64_t threadFiberID(uint32_t threadID, uint32_t fiberID) {
+  const uint64_t thread64 = threadID;
+  const uint64_t fiber64 = fiberID;
+  return thread64 * 31 + fiber64;
+}
+
+}  // anonymous namespace
+
+namespace marl {
+
+// Returns a pointer to the single Trace instance, a function-local static
+// that is constructed on the first call.
+Trace* Trace::get() {
+  static Trace trace;
+  return &trace;
+}
+
+// Queues a name event for the constructing thread ("main") and starts the
+// worker thread that drains queued events into the file "chrome.trace".
+Trace::Trace() {
+  nameThread("main");
+  thread = std::thread([&] {
+    Thread::setName("Trace worker");
+
+    std::fstream out("chrome.trace", std::ios_base::out);
+
+    // The file is one array of event objects; the closing bracket is written
+    // whenever this function exits.
+    out << "[" << std::endl;
+    defer(out << std::endl << "]" << std::endl);
+
+    bool needSeparator = false;
+    for (int written = 0; written < MaxEvents; written++) {
+      auto event = take();
+      if (event->type() == Event::Type::Shutdown) {
+        return;
+      }
+      if (needSeparator) {
+        out << "," << std::endl;
+      }
+      needSeparator = true;
+      out << "{" << std::endl;
+      event->write(out);
+      out << "}";
+    }
+
+    // Event budget exhausted: flag the trace as stopped, then keep consuming
+    // (and discarding) events until the shutdown event arrives.
+    stopped = true;
+
+    for (;;) {
+      if (take()->type() == Event::Type::Shutdown) {
+        break;
+      }
+    }
+  });
+}
+
+// Queues a Shutdown event for the worker thread and waits for that thread to
+// exit.
+Trace::~Trace() {
+  put(new Shutdown());
+  thread.join();
+}
+
+// Queues an event carrying a printf-style formatted thread name. Does
+// nothing once the trace has been stopped.
+void Trace::nameThread(const char* fmt, ...) {
+  if (!stopped) {
+    auto* event = new NameThreadEvent();
+
+    va_list args;
+    va_start(args, fmt);
+    vsnprintf(event->name, Trace::MaxEventNameLength, fmt, args);
+    va_end(args);
+
+    put(event);  // the queue takes ownership of the event.
+  }
+}
+
+// Queues a timestamped begin event with a printf-style formatted name. Does
+// nothing once the trace has been stopped.
+void Trace::beginEvent(const char* fmt, ...) {
+  if (!stopped) {
+    auto* event = new BeginEvent();
+
+    va_list args;
+    va_start(args, fmt);
+    vsnprintf(event->name, Trace::MaxEventNameLength, fmt, args);
+    va_end(args);
+
+    event->timestamp = timestamp();
+    put(event);  // the queue takes ownership of the event.
+  }
+}
+
+// Queues a timestamped end event. Does nothing once the trace has been
+// stopped.
+void Trace::endEvent() {
+  if (!stopped) {
+    auto* event = new EndEvent();
+    event->timestamp = timestamp();
+    put(event);  // the queue takes ownership of the event.
+  }
+}
+
+// Queues a timestamped async-start event tagged with |id| and a printf-style
+// formatted name. Does nothing once the trace has been stopped.
+void Trace::beginAsyncEvent(uint32_t id, const char* fmt, ...) {
+  if (!stopped) {
+    auto* event = new AsyncStartEvent();
+
+    va_list args;
+    va_start(args, fmt);
+    vsnprintf(event->name, Trace::MaxEventNameLength, fmt, args);
+    va_end(args);
+
+    event->timestamp = timestamp();
+    event->id = id;
+    put(event);  // the queue takes ownership of the event.
+  }
+}
+
+// Queues a timestamped async-end event tagged with |id| and a printf-style
+// formatted name. Does nothing once the trace has been stopped.
+void Trace::endAsyncEvent(uint32_t id, const char* fmt, ...) {
+  if (!stopped) {
+    auto* event = new AsyncEndEvent();
+
+    va_list args;
+    va_start(args, fmt);
+    vsnprintf(event->name, Trace::MaxEventNameLength, fmt, args);
+    va_end(args);
+
+    event->timestamp = timestamp();
+    event->id = id;
+    put(event);  // the queue takes ownership of the event.
+  }
+}
+
+// Returns the number of whole microseconds elapsed between |createdAt| and
+// now, as measured by std::chrono::high_resolution_clock.
+uint64_t Trace::timestamp() {
+  using std::chrono::microseconds;
+  const auto elapsed = std::chrono::high_resolution_clock::now() - createdAt;
+  const auto micros = std::chrono::duration_cast<microseconds>(elapsed);
+  return static_cast<uint64_t>(micros.count());
+}
+
+// Takes ownership of |event| and appends it to one of the event queues,
+// chosen round-robin by the write index. The queue's condition variable is
+// notified only when the queue was empty before the push.
+void Trace::put(Event* event) {
+  auto& queue = eventQueues[eventQueueWriteIdx++ % eventQueues.size()];
+
+  bool wasEmpty = false;
+  {
+    // Hold the queue lock only for the emptiness check and the push.
+    std::unique_lock<std::mutex> lock(queue.mutex);
+    wasEmpty = queue.data.size() == 0;
+    queue.data.push(std::unique_ptr<Event>(event));
+  }
+
+  if (wasEmpty) {
+    queue.condition.notify_one();
+  }
+}
+
+// Removes and returns the oldest event of one of the event queues, chosen
+// round-robin by the read index. Blocks while that queue is empty.
+std::unique_ptr<Trace::Event> Trace::take() {
+  auto& queue = eventQueues[eventQueueReadIdx++ % eventQueues.size()];
+
+  std::unique_lock<std::mutex> lock(queue.mutex);
+  // Loop to tolerate wake-ups that occur while the queue is still empty.
+  while (queue.data.size() == 0) {
+    queue.condition.wait(lock);
+  }
+
+  auto event = std::move(queue.data.front());
+  queue.data.pop();
+  return event;
+}
+
+// Stream helpers for the write() functions below.
+// QUOTE(x) expands to a `<<` chain that emits x wrapped in double quotes; it
+// must therefore appear as the right-hand side of a stream insertion.
+#define QUOTE(x) "\"" << x << "\""
+// INDENT is one level (two spaces) of indentation in the trace file.
+#define INDENT "  "
+
+// Records the ids of the thread and fiber creating the event. threadID is
+// the hash of the current std::thread::id; fiberID is the id of the current
+// fiber, or 0 when the event is created outside of any fiber.
+Trace::Event::Event()
+    : threadID(std::hash<std::thread::id>()(std::this_thread::get_id())) {
+  if (auto fiber = Scheduler::Fiber::current()) {
+    fiberID = fiber->id;
+  } else {
+    // write() treats 0 as "no fiber". Assign it explicitly so that fiberID
+    // is never read while unset.
+    // NOTE(review): the member's declaration is not visible from this file -
+    // confirm whether it already carries a default initializer.
+    fiberID = 0;
+  }
+}
+
+// Writes the fields of this event to |out| as the body of a JSON object
+// (the enclosing braces are written by the caller):
+//   "name", then "cat" when categories are set, then "args" with the fiber
+//   id when it is non-zero, then "tid" when the thread id is non-zero, and
+//   finally "ph", "pid" and "ts".
+void Trace::Event::write(std::ostream& out) const {
+  out << INDENT << QUOTE("name") << ": " << QUOTE(name) << "," << std::endl;
+  if (categories != nullptr) {
+    out << INDENT << QUOTE("cat") << ": "
+        << "\"";
+    // NOTE(review): |categories| is treated as a nullptr-terminated array of
+    // C strings - confirm against its declaration. The list is emitted as a
+    // single comma-separated string.
+    auto first = true;
+    for (auto category = categories; *category != nullptr; category++) {
+      if (!first) {
+        out << ",";
+      }
+      first = false;  // every entry after the first is preceded by a comma.
+      out << *category;
+    }
+    out << "\"," << std::endl;
+  }
+  if (fiberID != 0) {
+    out << INDENT << QUOTE("args") << ": "
+        << "{" << std::endl
+        << INDENT << INDENT << QUOTE("fiber") << ": " << fiberID << std::endl
+        << INDENT << "}," << std::endl;
+  }
+  if (threadID != 0) {
+    out << INDENT << QUOTE("tid") << ": " << threadFiberID(threadID, fiberID)
+        << "," << std::endl;
+  }
+  out << INDENT << QUOTE("ph") << ": " << QUOTE(static_cast<char>(type()))
+      << "," << std::endl
+      << INDENT << QUOTE("pid") << ": " << processID << "," << std::endl
+      << INDENT << QUOTE("ts") << ": " << timestamp << std::endl;
+}
+
+// Writes this event to |out| as the body of a "thread_name" metadata record
+// (phase "M"): the record name, phase, process id, combined thread/fiber id
+// and, under "args", the thread's name.
+void Trace::NameThreadEvent::write(std::ostream& out) const {
+  out << INDENT << QUOTE("name") << ": " << QUOTE("thread_name") << ","
+      << std::endl;
+  out << INDENT << QUOTE("ph") << ": " << QUOTE("M") << "," << std::endl;
+  out << INDENT << QUOTE("pid") << ": " << processID << "," << std::endl;
+  out << INDENT << QUOTE("tid") << ": " << threadFiberID(threadID, fiberID)
+      << "," << std::endl;
+  out << INDENT << QUOTE("args") << ": {" << QUOTE("name") << ": "
+      << QUOTE(name) << "}" << std::endl;
+}
+
+// Writes the async-specific fields ("id" as a quoted value and the fixed
+// category "async"), followed by the common fields from Event::write().
+void Trace::AsyncEvent::write(std::ostream& out) const {
+  out << INDENT << QUOTE("id") << ": " << QUOTE(id) << "," << std::endl;
+  out << INDENT << QUOTE("cat") << ": " << QUOTE("async") << "," << std::endl;
+  Event::write(out);
+}
+
+}  // namespace marl
+
+#endif  // MARL_TRACE_ENABLED
--- a/3party/marl/src/waitgroup_bench.cpp
+++ b/3party/marl/src/waitgroup_bench.cpp
@ -0,0 +1,31 @@
+// Copyright 2020 The Marl Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "marl_bench.h"
+
+#include "marl/waitgroup.h"
+
+// Benchmark: per iteration, schedules |numTasks| tasks that each signal a
+// shared wait group once, then waits for the wait group to drain.
+BENCHMARK_DEFINE_F(Schedule, WaitGroup)(benchmark::State& state) {
+  run(state, [&](int numTasks) {
+    for (auto _ : state) {
+      marl::WaitGroup pending;
+      pending.add(numTasks);
+      for (int task = 0; task < numTasks; task++) {
+        // Each task captures its own copy of the wait group object.
+        marl::schedule([pending] { pending.done(); });
+      }
+      pending.wait();
+    }
+  });
+}
+BENCHMARK_REGISTER_F(Schedule, WaitGroup)->Apply(Schedule::args);
--- a/Show More
+++ b/Show More