Release 0.8.1

libunwind: fix unwinding through libunwind stack frames
Fixes #9591
2021-09-06 19:41:54 -07:00 · 2021-09-03 10:19:01 -07:00 · 2021-09-03 10:18:53 -07:00 · 2021-09-03 10:18:28 -07:00 · 2021-09-01 17:01:35 -07:00 · 2021-08-31 13:45:04 -07:00
6502 changed files with 673200 additions and 134699 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -3,9 +3,10 @@
 langref.html.in text eol=lf
 deps/SoftFloat-3e/*.txt text eol=crlf

-deps/* linguist-vendored
-lib/include/* linguist-vendored
-lib/libc/* linguist-vendored
-lib/libcxx/* linguist-vendored
-lib/libcxxabi/* linguist-vendored
-lib/libunwind/* linguist-vendored
+deps/** linguist-vendored
+lib/include/** linguist-vendored
+lib/libc/** linguist-vendored
+lib/libcxx/** linguist-vendored
+lib/libcxxabi/** linguist-vendored
+lib/libunwind/** linguist-vendored
+lib/tsan/** linguist-vendored
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,9 @@
 # -andrewrk

 zig-cache/
+zig-out/
+/release/
+/debug/
 /build/
 /build-*/
 /docgen_tmp/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8.5)
+cmake_minimum_required(VERSION 2.8.12)

 # Use ccache if possible
 FIND_PROGRAM(CCACHE_PROGRAM ccache)
@@ -25,8 +25,8 @@ project(zig C CXX)
 set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH})

 set(ZIG_VERSION_MAJOR 0)
-set(ZIG_VERSION_MINOR 7)
-set(ZIG_VERSION_PATCH 0)
+set(ZIG_VERSION_MINOR 8)
+set(ZIG_VERSION_PATCH 1)
 set(ZIG_VERSION "" CACHE STRING "Override Zig version string. Default is to find out with git.")

 if("${ZIG_VERSION}" STREQUAL "")
@@ -34,18 +34,31 @@ if("${ZIG_VERSION}" STREQUAL "")
    find_program(GIT_EXE NAMES git)
    if(GIT_EXE)
        execute_process(
-            COMMAND ${GIT_EXE} -C ${CMAKE_SOURCE_DIR} name-rev HEAD --tags --name-only --no-undefined --always
+            COMMAND ${GIT_EXE} -C ${CMAKE_SOURCE_DIR} describe --match *.*.* --tags
            RESULT_VARIABLE EXIT_STATUS
-            OUTPUT_VARIABLE ZIG_GIT_REV
+            OUTPUT_VARIABLE GIT_DESCRIBE
            OUTPUT_STRIP_TRAILING_WHITESPACE
            ERROR_QUIET)
        if(EXIT_STATUS EQUAL "0")
-            if(ZIG_GIT_REV MATCHES "\\^0$")
-                if(NOT("${ZIG_GIT_REV}" STREQUAL "${ZIG_VERSION}^0"))
-                    message("WARNING: Tag does not match configured Zig version")
+            if(GIT_DESCRIBE MATCHES "^v?([0-9]+\\.[0-9]+\\.[0-9]+)$")
+                # Tagged release version.
+                set(GIT_TAG ${CMAKE_MATCH_1})
+                if(NOT GIT_TAG VERSION_EQUAL ZIG_VERSION)
+                    message(SEND_ERROR "Zig version (${ZIG_VERSION}) does not match Git tag (${GIT_TAG}).")
                endif()
+            elseif(GIT_DESCRIBE MATCHES "^v?([0-9]+\\.[0-9]+\\.[0-9]+)-([0-9]+)-g(.+)$")
+                # Untagged pre-release. The Zig version is updated to include the number of commits
+                # since the last tagged version and the commit hash. The version is formatted in
+                # accordance with the https://semver.org specification.
+                set(GIT_TAG ${CMAKE_MATCH_1})
+                set(GIT_COMMITS_AFTER_TAG ${CMAKE_MATCH_2})
+                set(GIT_COMMIT ${CMAKE_MATCH_3})
+                if(NOT ZIG_VERSION VERSION_GREATER GIT_TAG)
+                    message(SEND_ERROR "Zig version (${ZIG_VERSION}) must be greater than tagged ancestor (${GIT_TAG}).")
+                endif()
+                set(ZIG_VERSION "${ZIG_VERSION}-dev.${GIT_COMMITS_AFTER_TAG}+${GIT_COMMIT}")
            else()
-                set(ZIG_VERSION "${ZIG_VERSION}+${ZIG_GIT_REV}")
+                message(WARNING "Failed to parse version from output of `git describe`.")
            endif()
        endif()
    endif()
@@ -54,6 +67,7 @@ message("Configuring zig version ${ZIG_VERSION}")

 set(ZIG_STATIC off CACHE BOOL "Attempt to build a static zig executable (not compatible with glibc)")
 set(ZIG_STATIC_LLVM off CACHE BOOL "Prefer linking against static LLVM libraries")
+set(ZIG_STATIC_ZLIB off CACHE BOOL "Prefer linking against static zlib")
 set(ZIG_PREFER_CLANG_CPP_DYLIB off CACHE BOOL "Try to link against -lclang-cpp")
 set(ZIG_USE_CCACHE off CACHE BOOL "Use ccache if available")

@@ -62,10 +76,8 @@ if(CCACHE_PROGRAM AND ZIG_USE_CCACHE)
 endif()

 if(ZIG_STATIC)
-    set(ZIG_STATIC_LLVM "on")
-    set(ZIG_LINK_MODE "Static")
-else()
-    set(ZIG_LINK_MODE "Dynamic")
+    set(ZIG_STATIC_LLVM ON)
+    set(ZIG_STATIC_ZLIB ON)
 endif()

 string(REGEX REPLACE "\\\\" "\\\\\\\\" ZIG_LIBC_LIB_DIR_ESCAPED "${ZIG_LIBC_LIB_DIR}")
@@ -77,17 +89,27 @@ option(ZIG_TEST_COVERAGE "Build Zig with test coverage instrumentation" OFF)
 set(ZIG_TARGET_TRIPLE "native" CACHE STRING "arch-os-abi to output binaries for")
 set(ZIG_TARGET_MCPU "baseline" CACHE STRING "-mcpu parameter to output binaries for")
 set(ZIG_EXECUTABLE "" CACHE STRING "(when cross compiling) path to already-built zig binary")
-set(ZIG_PREFER_LLVM_CONFIG off CACHE BOOL "(when cross compiling) use llvm-config to find target llvm dependencies if needed")
+set(ZIG_SINGLE_THREADED off CACHE BOOL "limit the zig compiler to use only 1 thread")
+set(ZIG_OMIT_STAGE2 off CACHE BOOL "omit the stage2 backend from stage1")
+set(ZIG_ENABLE_LOGGING off CACHE BOOL "enable logging")
+
+if("${ZIG_TARGET_TRIPLE}" STREQUAL "native")
+    set(ZIG_USE_LLVM_CONFIG ON CACHE BOOL "use llvm-config to find LLVM libraries")
+else()
+    set(ZIG_USE_LLVM_CONFIG OFF CACHE BOOL "use llvm-config to find LLVM libraries")
+endif()

 find_package(llvm)
 find_package(clang)
 find_package(lld)

-if(APPLE AND ZIG_STATIC)
+if(ZIG_STATIC_ZLIB)
    list(REMOVE_ITEM LLVM_LIBRARIES "-lz")
-    find_library(ZLIB NAMES libz.a z zlib libz)
+    find_library(ZLIB NAMES libz.a libzlibstatic.a z zlib libz)
    list(APPEND LLVM_LIBRARIES "${ZLIB}")
+endif()

+if(APPLE AND ZIG_STATIC)
    list(REMOVE_ITEM LLVM_LIBRARIES "-lcurses")
    find_library(CURSES NAMES libcurses.a curses libcurses libncurses.a ncurses libncurses)
    list(APPEND LLVM_LIBRARIES "${CURSES}")
@@ -258,6 +280,15 @@ set(SOFTFLOAT_LIBRARIES embedded_softfloat)

 find_package(Threads)

+set(ZIG_LIB_DIR "lib/zig")
+set(C_HEADERS_DEST "${ZIG_LIB_DIR}/include")
+set(LIBC_FILES_DEST "${ZIG_LIB_DIR}/libc")
+set(LIBUNWIND_FILES_DEST "${ZIG_LIB_DIR}/libunwind")
+set(LIBCXX_FILES_DEST "${ZIG_LIB_DIR}/libcxx")
+set(ZIG_STD_DEST "${ZIG_LIB_DIR}/std")
+set(ZIG_CONFIG_H_OUT "${CMAKE_BINARY_DIR}/config.h")
+set(ZIG_CONFIG_ZIG_OUT "${CMAKE_BINARY_DIR}/config.zig")
+
 # This is our shim which will be replaced by stage1.zig.
 set(ZIG0_SOURCES
    "${CMAKE_SOURCE_DIR}/src/stage1/zig0.cpp"
@@ -265,7 +296,7 @@ set(ZIG0_SOURCES

 set(STAGE1_SOURCES
    "${CMAKE_SOURCE_DIR}/src/stage1/analyze.cpp"
-    "${CMAKE_SOURCE_DIR}/src/stage1/ast_render.cpp"
+    "${CMAKE_SOURCE_DIR}/src/stage1/astgen.cpp"
    "${CMAKE_SOURCE_DIR}/src/stage1/bigfloat.cpp"
    "${CMAKE_SOURCE_DIR}/src/stage1/bigint.cpp"
    "${CMAKE_SOURCE_DIR}/src/stage1/buffer.cpp"
@@ -280,11 +311,11 @@ set(STAGE1_SOURCES
    "${CMAKE_SOURCE_DIR}/src/stage1/os.cpp"
    "${CMAKE_SOURCE_DIR}/src/stage1/parser.cpp"
    "${CMAKE_SOURCE_DIR}/src/stage1/range_set.cpp"
+    "${CMAKE_SOURCE_DIR}/src/stage1/softfloat_ext.cpp"
    "${CMAKE_SOURCE_DIR}/src/stage1/stage1.cpp"
    "${CMAKE_SOURCE_DIR}/src/stage1/target.cpp"
    "${CMAKE_SOURCE_DIR}/src/stage1/tokenizer.cpp"
    "${CMAKE_SOURCE_DIR}/src/stage1/util.cpp"
-    "${CMAKE_SOURCE_DIR}/src/stage1/softfloat_ext.cpp"
 )
 set(OPTIMIZED_C_SOURCES
    "${CMAKE_SOURCE_DIR}/src/stage1/parse_f128.c"
@@ -292,6 +323,7 @@ set(OPTIMIZED_C_SOURCES
 set(ZIG_CPP_SOURCES
    # These are planned to stay even when we are self-hosted.
    "${CMAKE_SOURCE_DIR}/src/zig_llvm.cpp"
+    "${CMAKE_SOURCE_DIR}/src/zig_llvm-ar.cpp"
    "${CMAKE_SOURCE_DIR}/src/zig_clang.cpp"
    "${CMAKE_SOURCE_DIR}/src/zig_clang_driver.cpp"
    "${CMAKE_SOURCE_DIR}/src/zig_clang_cc1_main.cpp"
@@ -299,6 +331,277 @@ set(ZIG_CPP_SOURCES
    # https://github.com/ziglang/zig/issues/6363
    "${CMAKE_SOURCE_DIR}/src/windows_sdk.cpp"
 )
+# Needed because we use cmake, not the zig build system, to build zig1.o.
+# This list is generated by building zig and then clearing the zig-cache directory,
+# then manually running the build-obj command (see BUILD_ZIG1_ARGS), and then looking
+# in the zig-cache directory for the compiler-generated list of zig file dependencies.
+set(ZIG_STAGE2_SOURCES
+    "${ZIG_CONFIG_ZIG_OUT}"
+    "${CMAKE_SOURCE_DIR}/lib/std/array_hash_map.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/array_list.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/ascii.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/atomic.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/atomic/Atomic.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/atomic/queue.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/atomic/stack.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/base64.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/buf_map.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/builtin.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/c.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/c/linux.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/c/tokenizer.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/child_process.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/coff.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/comptime_string_map.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/crypto.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/crypto/blake3.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/crypto/siphash.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/debug.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/dwarf.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/dwarf_bits.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/elf.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/event.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/event/batch.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/event/loop.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/fifo.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/fmt.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/fmt/errol.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/fmt/errol/enum3.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/fmt/errol/lookup.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/fmt/parse_float.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/fs.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/fs/file.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/fs/get_app_data_dir.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/fs/path.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/hash.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/hash/auto_hash.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/hash/wyhash.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/hash_map.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/heap.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/heap/arena_allocator.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/io.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/io/buffered_atomic_file.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/io/buffered_writer.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/io/change_detection_stream.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/io/counting_reader.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/io/counting_writer.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/io/find_byte_writer.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/io/fixed_buffer_stream.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/io/limited_reader.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/io/reader.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/io/seekable_stream.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/io/writer.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/json.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/json/write_stream.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/leb128.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/linked_list.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/log.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/macho.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/math.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/math/big.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/math/big/int.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/math/floor.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/math/frexp.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/math/inf.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/math/isinf.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/math/isnan.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/math/ln.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/math/log.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/math/log10.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/math/log2.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/math/nan.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/math/signbit.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/math/sqrt.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/mem.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/mem/Allocator.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/meta.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/meta/trailer_flags.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/meta/trait.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/multi_array_list.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/os.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/os/bits.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/os/bits/linux.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/os/bits/linux/errno-generic.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/os/bits/linux/netlink.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/os/bits/linux/prctl.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/os/bits/linux/securebits.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/os/bits/linux/x86_64.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/os/linux.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/os/linux/io_uring.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/os/linux/x86_64.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/os/windows.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/os/windows/bits.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/os/windows/ntstatus.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/os/windows/win32error.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/Progress.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/pdb.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/process.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/rand.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/sort.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/addXf3.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/atomics.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/clear_cache.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/clzsi2.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/compareXf2.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/divdf3.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/divsf3.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/divtf3.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/divti3.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/extendXfYf2.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/fixdfdi.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/fixdfsi.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/fixdfti.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/fixint.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/fixsfdi.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/fixsfsi.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/fixsfti.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/fixtfdi.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/fixtfsi.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/fixtfti.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/fixuint.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/fixunsdfdi.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/fixunsdfsi.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/fixunsdfti.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/fixunssfdi.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/fixunssfsi.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/fixunssfti.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/fixunstfdi.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/fixunstfsi.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/fixunstfti.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/floatXisf.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/floatdidf.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/floatditf.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/floatsiXf.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/floattidf.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/floattitf.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/floatundidf.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/floatundisf.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/floatunditf.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/floatunsidf.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/floatunsisf.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/floatunsitf.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/floatuntidf.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/floatuntisf.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/floatuntitf.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/int.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/modti3.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/mulXf3.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/muldi3.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/mulodi4.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/muloti4.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/multi3.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/negXf2.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/popcountdi2.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/shift.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/stack_probe.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/truncXfYf2.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/udivmod.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/udivmodti4.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/udivti3.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/special/compiler_rt/umodti3.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/start.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/std.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/target.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/target/aarch64.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/target/amdgpu.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/target/arm.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/target/avr.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/target/bpf.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/target/hexagon.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/target/mips.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/target/msp430.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/target/nvptx.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/target/powerpc.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/target/riscv.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/target/sparc.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/target/systemz.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/target/wasm.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/target/x86.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/Thread.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/Thread/AutoResetEvent.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/Thread/Mutex.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/Thread/ResetEvent.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/Thread/StaticResetEvent.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/time.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/unicode.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/zig.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/zig/ast.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/zig/cross_target.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/zig/parse.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/zig/render.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/zig/string_literal.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/zig/system.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/zig/system/x86.zig"
+    "${CMAKE_SOURCE_DIR}/lib/std/zig/tokenizer.zig"
+    "${CMAKE_SOURCE_DIR}/src/Cache.zig"
+    "${CMAKE_SOURCE_DIR}/src/Compilation.zig"
+    "${CMAKE_SOURCE_DIR}/src/DepTokenizer.zig"
+    "${CMAKE_SOURCE_DIR}/src/Module.zig"
+    "${CMAKE_SOURCE_DIR}/src/Package.zig"
+    "${CMAKE_SOURCE_DIR}/src/RangeSet.zig"
+    "${CMAKE_SOURCE_DIR}/src/ThreadPool.zig"
+    "${CMAKE_SOURCE_DIR}/src/TypedValue.zig"
+    "${CMAKE_SOURCE_DIR}/src/WaitGroup.zig"
+    "${CMAKE_SOURCE_DIR}/src/AstGen.zig"
+    "${CMAKE_SOURCE_DIR}/src/clang.zig"
+    "${CMAKE_SOURCE_DIR}/src/clang_options.zig"
+    "${CMAKE_SOURCE_DIR}/src/clang_options_data.zig"
+    "${CMAKE_SOURCE_DIR}/src/codegen.zig"
+    "${CMAKE_SOURCE_DIR}/src/codegen/aarch64.zig"
+    "${CMAKE_SOURCE_DIR}/src/codegen/arm.zig"
+    "${CMAKE_SOURCE_DIR}/src/codegen/c.zig"
+    "${CMAKE_SOURCE_DIR}/src/codegen/llvm.zig"
+    "${CMAKE_SOURCE_DIR}/src/codegen/llvm/bindings.zig"
+    "${CMAKE_SOURCE_DIR}/src/codegen/riscv64.zig"
+    "${CMAKE_SOURCE_DIR}/src/codegen/wasm.zig"
+    "${CMAKE_SOURCE_DIR}/src/codegen/x86_64.zig"
+    "${CMAKE_SOURCE_DIR}/src/glibc.zig"
+    "${CMAKE_SOURCE_DIR}/src/introspect.zig"
+    "${CMAKE_SOURCE_DIR}/src/air.zig"
+    "${CMAKE_SOURCE_DIR}/src/libc_installation.zig"
+    "${CMAKE_SOURCE_DIR}/src/libcxx.zig"
+    "${CMAKE_SOURCE_DIR}/src/libtsan.zig"
+    "${CMAKE_SOURCE_DIR}/src/libunwind.zig"
+    "${CMAKE_SOURCE_DIR}/src/link.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/C.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/Coff.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/Elf.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/MachO.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/MachO/Archive.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/MachO/CodeSignature.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/MachO/DebugSymbols.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/MachO/Dylib.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/MachO/Object.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/MachO/Symbol.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/MachO/Trie.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/MachO/Zld.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/MachO/bind.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/MachO/commands.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/MachO/reloc.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/MachO/reloc/aarch64.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/MachO/reloc/x86_64.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/Wasm.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/C/zig.h"
+    "${CMAKE_SOURCE_DIR}/src/link/msdos-stub.bin"
+    "${CMAKE_SOURCE_DIR}/src/liveness.zig"
+    "${CMAKE_SOURCE_DIR}/src/main.zig"
+    "${CMAKE_SOURCE_DIR}/src/mingw.zig"
+    "${CMAKE_SOURCE_DIR}/src/musl.zig"
+    "${CMAKE_SOURCE_DIR}/src/print_env.zig"
+    "${CMAKE_SOURCE_DIR}/src/print_targets.zig"
+    "${CMAKE_SOURCE_DIR}/src/stage1.zig"
+    "${CMAKE_SOURCE_DIR}/src/target.zig"
+    "${CMAKE_SOURCE_DIR}/src/tracy.zig"
+    "${CMAKE_SOURCE_DIR}/src/translate_c.zig"
+    "${CMAKE_SOURCE_DIR}/src/translate_c/ast.zig"
+    "${CMAKE_SOURCE_DIR}/src/type.zig"
+    "${CMAKE_SOURCE_DIR}/src/value.zig"
+    "${CMAKE_SOURCE_DIR}/src/wasi_libc.zig"
+    "${CMAKE_SOURCE_DIR}/src/windows_sdk.zig"
+    "${CMAKE_SOURCE_DIR}/src/Zir.zig"
+    "${CMAKE_SOURCE_DIR}/src/Sema.zig"
+)

 if(MSVC)
    set(MSVC_DIA_SDK_DIR "$ENV{VSINSTALLDIR}DIA SDK")
@@ -308,14 +611,18 @@ if(MSVC)
    endif()
 endif()

-set(ZIG_LIB_DIR "lib/zig")
-set(C_HEADERS_DEST "${ZIG_LIB_DIR}/include")
-set(LIBC_FILES_DEST "${ZIG_LIB_DIR}/libc")
-set(LIBUNWIND_FILES_DEST "${ZIG_LIB_DIR}/libunwind")
-set(LIBCXX_FILES_DEST "${ZIG_LIB_DIR}/libcxx")
-set(ZIG_STD_DEST "${ZIG_LIB_DIR}/std")
-set(ZIG_CONFIG_H_OUT "${CMAKE_BINARY_DIR}/config.h")
-set(ZIG_CONFIG_ZIG_OUT "${CMAKE_BINARY_DIR}/config.zig")
+if(ZIG_OMIT_STAGE2)
+  set(ZIG_OMIT_STAGE2_BOOL "true")
+else()
+  set(ZIG_OMIT_STAGE2_BOOL "false")
+endif()
+
+if(ZIG_ENABLE_LOGGING)
+  set(ZIG_ENABLE_LOGGING_BOOL "true")
+else()
+  set(ZIG_ENABLE_LOGGING_BOOL "false")
+endif()
+
 configure_file (
    "${CMAKE_SOURCE_DIR}/src/stage1/config.h.in"
    "${ZIG_CONFIG_H_OUT}"
@@ -339,6 +646,12 @@ else(MSVC)
  set(EXE_CFLAGS "-std=c++14")
 endif(MSVC)

+if(ZIG_STATIC)
+    set(EXE_CFLAGS "${EXE_CFLAGS} -DZIG_LINK_MODE=Static")
+else()
+    set(EXE_CFLAGS "${EXE_CFLAGS} -DZIG_LINK_MODE=Dynamic")
+endif()
+
 if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
    if(MSVC)
        set(EXE_CFLAGS "${EXE_CFLAGS} /w")
@@ -348,6 +661,10 @@ if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
        if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.0)
            set(EXE_CFLAGS "${EXE_CFLAGS} -Werror=implicit-fallthrough")
        endif()
+        # GCC older than 9.3 (incl. 9.2.x snapshots) cannot detect valid variable initialization in some cases
+        if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.3)
+            set(EXE_CFLAGS "${EXE_CFLAGS} -Wno-maybe-uninitialized")
+        endif()
    endif()
 endif()

@@ -434,12 +751,14 @@ if(MSVC OR MINGW)
    target_link_libraries(zigstage1 LINK_PUBLIC version)
 endif()

-add_executable(zig0 ${ZIG0_SOURCES})
-set_target_properties(zig0 PROPERTIES
-    COMPILE_FLAGS ${EXE_CFLAGS}
-    LINK_FLAGS ${EXE_LDFLAGS}
-)
-target_link_libraries(zig0 zigstage1)
+if("${ZIG_EXECUTABLE}" STREQUAL "")
+  add_executable(zig0 ${ZIG0_SOURCES})
+  set_target_properties(zig0 PROPERTIES
+      COMPILE_FLAGS ${EXE_CFLAGS}
+      LINK_FLAGS ${EXE_LDFLAGS}
+  )
+  target_link_libraries(zig0 zigstage1)
+endif()

 if(MSVC)
    set(ZIG1_OBJECT "${CMAKE_BINARY_DIR}/zig1.obj")
@@ -451,6 +770,11 @@ if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
 else()
    set(ZIG1_RELEASE_ARG -OReleaseFast --strip)
 endif()
+if(ZIG_SINGLE_THREADED)
+  set(ZIG1_SINGLE_THREADED_ARG "--single-threaded")
+else()
+  set(ZIG1_SINGLE_THREADED_ARG "")
+endif()

 set(BUILD_ZIG1_ARGS
    "src/stage1.zig"
@@ -460,6 +784,7 @@ set(BUILD_ZIG1_ARGS
    --override-lib-dir "${CMAKE_SOURCE_DIR}/lib"
    "-femit-bin=${ZIG1_OBJECT}"
    "${ZIG1_RELEASE_ARG}"
+    "${ZIG1_SINGLE_THREADED_ARG}"
    -lc
    --pkg-begin build_options "${ZIG_CONFIG_ZIG_OUT}"
    --pkg-end
@@ -468,10 +793,10 @@ set(BUILD_ZIG1_ARGS
 )

 if("${ZIG_EXECUTABLE}" STREQUAL "")
-  add_custom_target(zig_build_zig1 ALL
+  add_custom_command(
+      OUTPUT "${ZIG1_OBJECT}"
      COMMAND zig0 ${BUILD_ZIG1_ARGS}
-      DEPENDS zig0
-      BYPRODUCTS "${ZIG1_OBJECT}"
+      DEPENDS zig0 ${ZIG_STAGE2_SOURCES}
      COMMENT STATUS "Building self-hosted component ${ZIG1_OBJECT}"
      WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
  )
@@ -480,56 +805,57 @@ if("${ZIG_EXECUTABLE}" STREQUAL "")
    set(ZIG_EXECUTABLE "${ZIG_EXECUTABLE}.exe")
  endif()
 else()
-  add_custom_target(zig_build_zig1 ALL
+  add_custom_command(
+      OUTPUT "${ZIG1_OBJECT}"
      COMMAND "${ZIG_EXECUTABLE}" "build-obj" ${BUILD_ZIG1_ARGS}
-      BYPRODUCTS "${ZIG1_OBJECT}"
+      DEPENDS ${ZIG_STAGE2_SOURCES}
      COMMENT STATUS "Building self-hosted component ${ZIG1_OBJECT}"
      WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
  )
 endif()

 # cmake won't let us configure an executable without C sources.
-add_executable(zig "${CMAKE_SOURCE_DIR}/src/stage1/empty.cpp")
+add_executable(zig "${CMAKE_SOURCE_DIR}/src/stage1/empty.cpp" "${ZIG1_OBJECT}")

 set_target_properties(zig PROPERTIES
    COMPILE_FLAGS ${EXE_CFLAGS}
    LINK_FLAGS ${EXE_LDFLAGS}
 )
-target_link_libraries(zig "${ZIG1_OBJECT}" zigstage1)
+target_link_libraries(zig zigstage1)
 if(MSVC)
  target_link_libraries(zig ntdll.lib)
 elseif(MINGW)
  target_link_libraries(zig ntdll)
 endif()
-add_dependencies(zig zig_build_zig1)

 install(TARGETS zig DESTINATION bin)

-set(ZIG_INSTALL_ARGS "build"
-    --override-lib-dir "${CMAKE_SOURCE_DIR}/lib"
-    "-Dlib-files-only"
-    --prefix "${CMAKE_INSTALL_PREFIX}"
-    "-Dconfig_h=${ZIG_CONFIG_H_OUT}"
-    install
-)
+set(ZIG_SKIP_INSTALL_LIB_FILES off CACHE BOOL
+    "Disable copying lib/ files to install prefix during the build phase")

-# CODE has no effect with Visual Studio build system generator, therefore
-# when using Visual Studio build system generator we resort to running
-# `zig build install` during the build phase.
-if(MSVC)
-    set(ZIG_SKIP_INSTALL_LIB_FILES off CACHE BOOL
-        "Windows-only: Disable copying lib/ files to install prefix during the build phase")
-    if(NOT ZIG_SKIP_INSTALL_LIB_FILES)
+if(NOT ZIG_SKIP_INSTALL_LIB_FILES)
+    set(ZIG_INSTALL_ARGS "build"
+        --override-lib-dir "${CMAKE_SOURCE_DIR}/lib"
+        "-Dlib-files-only"
+        --prefix "${CMAKE_INSTALL_PREFIX}"
+        "-Dconfig_h=${ZIG_CONFIG_H_OUT}"
+        install
+    )
+
+    # CODE has no effect with Visual Studio build system generator, therefore
+    # when using Visual Studio build system generator we resort to running
+    # `zig build install` during the build phase.
+    if(MSVC)
        add_custom_target(zig_install_lib_files ALL
            COMMAND zig ${ZIG_INSTALL_ARGS}
            DEPENDS zig
            WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
        )
+    else()
+        get_target_property(zig_BINARY_DIR zig BINARY_DIR)
+        install(CODE "set(zig_EXE \"${ZIG_EXECUTABLE}\")")
+        install(CODE "set(ZIG_INSTALL_ARGS \"${ZIG_INSTALL_ARGS}\")")
+        install(CODE "set(CMAKE_SOURCE_DIR \"${CMAKE_SOURCE_DIR}\")")
+        install(SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/cmake/install.cmake)
    endif()
-else()
-    get_target_property(zig_BINARY_DIR zig BINARY_DIR)
-    install(CODE "set(zig_EXE \"${ZIG_EXECUTABLE}\")")
-    install(CODE "set(ZIG_INSTALL_ARGS \"${ZIG_INSTALL_ARGS}\")")
-    install(CODE "set(CMAKE_SOURCE_DIR \"${CMAKE_SOURCE_DIR}\")")
-    install(SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/cmake/install.cmake)
 endif()
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -9,7 +9,7 @@ a link. There is no concept of "official" or "unofficial", however, each
 gathering place has its own moderators and rules.

 This is Andrew Kelley speaking. At least for now, I'm the moderator of the
-ziglang organization GitHub repositories and the #zig IRC channel on Freenode.
+ziglang organization GitHub repositories and the #zig IRC channel on Libera.chat.
 **This document contains the rules that govern these two spaces only**.

 The rules here are strict. This space is for focused, on topic, technical work
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -7,7 +7,10 @@ personal project. Here are some great examples:

 * [Oxid](https://github.com/dbandstra/oxid) - arcade style game
 * [TM35-Metronome](https://github.com/TM35-Metronome) - tools for modifying and randomizing Pokémon games
- * [trOS](https://github.com/sjdh02/trOS) - tiny aarch64 baremetal OS thingy
+ * [River](https://github.com/ifreund/river/) - a dynamic tiling Wayland compositor
+
+More examples can be found on the
+[Community Projects Wiki](https://github.com/ziglang/zig/wiki/Community-Projects).

 Without fail, these projects lead to discovering bugs and helping flesh out use
 cases, which lead to further design iterations of Zig. Importantly, each issue
@@ -51,7 +54,8 @@ knowledge of Zig internals.**

 ### Editing Source Code

-First, build the Stage 1 compiler as described in [Building from Source](README.md#Building-from-Source).
+First, build the Stage 1 compiler as described in
+[Building Zig From Source](https://github.com/ziglang/zig/wiki/Building-Zig-From-Source).

 Zig locates lib files relative to executable path by searching up the
 filesystem tree for a sub-path of `lib/zig/std/std.zig` or `lib/std/std.zig`.
@@ -129,6 +133,14 @@ This will enable running behavior tests and std lib tests with Wine. It's
 recommended for Linux users to install Wine and enable this testing option 
 when editing the standard library or anything Windows-related.

+#### Testing WebAssembly using wasmtime
+
+If you have [wasmtime](https://wasmtime.dev/) installed, take advantage of the
+`-Denable-wasmtime` flag which will enable running WASI behavior tests and std
+lib tests. It's recommended for all users to install wasmtime and enable this
+testing option when editing the standard library and especially anything
+WebAssembly-related.
+
 #### Improving Translate-C

 Please read the [Editing Source Code](#editing-source-code) section as a
@@ -152,7 +164,7 @@ The relevant tests for this feature are:
   same, and that the program exits cleanly. This kind of test coverage is preferred, when
   possible, because it makes sure that the resulting Zig code is actually viable.

- * `test/stage1/behavior/translate_c_macros.zig` - each test case consists of a Zig test 
+ * `test/stage1/behavior/translate_c_macros.zig` - each test case consists of a Zig test
   which checks that the relevant macros in `test/stage1/behavior/translate_c_macros.h`.
   have the correct values. Macros have to be tested separately since they are expanded by
   Clang in `run_translated_c` tests.
@@ -173,21 +185,21 @@ repo, we maintain a C API on top of Clang's C++ API:
   Clang's C++ API changes. This one file necessarily does include Clang's C++ headers, which
   makes it the slowest-to-compile source file in all of Zig's codebase.

- * `src-self-hosted/clang.zig` - the Zig equivalent of `src/zig_clang.h`. This is a manually
+ * `src/clang.zig` - the Zig equivalent of `src/zig_clang.h`. This is a manually
   maintained list of types and functions that are ABI-compatible with the Clang C API we
   maintain. In theory this could be generated by running translate-c on `src/zig_clang.h`,
   but that would introduce a dependency cycle, since we are using this file to implement
   translate-c.

 Finally, the actual source code for the translate-c feature is
-`src-self-hosted/translate_c.zig`. This code uses the Clang C API exposed by
-`src-self-hosted/clang.zig`, and produces Zig AST.
+`src/translate_c.zig`. This code uses the Clang C API exposed by
+`src/clang.zig`, and produces Zig AST.

 The steps for contributing to translate-c look like this:

 1. Identify a test case you want to improve. Add it as a run-translated-c test
    case (usually preferable), or as a translate-c test case.

- 2. Edit `src-self-hosted/translate_c.zig` to improve the behavior.
+ 2. Edit `src/translate_c.zig` to improve the behavior.

 3. Run the relevant tests: `./zig build test-run-translated-c test-translate-c`
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 The MIT License (Expat)

-Copyright (c) 2015 Andrew Kelley
+Copyright (c) 2015-2021, Zig contributors

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ A general-purpose programming language and toolchain for maintaining

 ## Resources

- * [Introduction](https://ziglang.org/#Introduction)
+ * [Introduction](https://ziglang.org/learn/#introduction)
 * [Download & Documentation](https://ziglang.org/download)
 * [Chapter 0 - Getting Started | ZigLearn.org](https://ziglearn.org/)
 * [Community](https://github.com/ziglang/zig/wiki/Community)
@@ -14,98 +14,12 @@ A general-purpose programming language and toolchain for maintaining
 * [Frequently Asked Questions](https://github.com/ziglang/zig/wiki/FAQ)
 * [Community Projects](https://github.com/ziglang/zig/wiki/Community-Projects)

-## Building from Source
+## Installation

-[![Build Status](https://dev.azure.com/ziglang/zig/_apis/build/status/ziglang.zig?branchName=master)](https://dev.azure.com/ziglang/zig/_build/latest?definitionId=1&branchName=master)
-
-Note that you can
-[download a binary of the master branch](https://ziglang.org/download/#release-master) or 
-[install Zig from a package manager](https://github.com/ziglang/zig/wiki/Install-Zig-from-a-Package-Manager).
-
-### Stage 1: Build Zig from C++ Source Code
-
-This step must be repeated when you make changes to any of the C++ source code.
-
-#### Dependencies
-
-##### POSIX
-
- * cmake >= 2.8.5
- * gcc >= 5.0.0 or clang >= 3.6.0
- * LLVM, Clang, LLD development libraries == 11.x, compiled with the same gcc or clang version above
-   - Use the system package manager, or [build from source](https://github.com/ziglang/zig/wiki/How-to-build-LLVM,-libclang,-and-liblld-from-source#posix).
-
-##### Windows
-
- * cmake >= 3.15.3
- * Microsoft Visual Studio. Supported versions:
-   - 2017 (version 15.8)
-   - 2019 (version 16)
- * LLVM, Clang, LLD development libraries == 11.x
-   - Use the [pre-built binaries](https://github.com/ziglang/zig/wiki/Building-Zig-on-Windows) or [build from source](https://github.com/ziglang/zig/wiki/How-to-build-LLVM,-libclang,-and-liblld-from-source#windows).
-
-#### Instructions
-
-##### POSIX
-
-```
-mkdir build
-cd build
-cmake ..
-make install
-```
-
-Need help? [Troubleshooting Build Issues](https://github.com/ziglang/zig/wiki/Troubleshooting-Build-Issues)
-
-##### MacOS
-
-```
-brew install cmake llvm
-brew outdated llvm || brew upgrade llvm
-mkdir build
-cd build
-cmake .. -DCMAKE_PREFIX_PATH=$(brew --prefix llvm)
-make install
-```
-
-##### Windows
-
-See https://github.com/ziglang/zig/wiki/Building-Zig-on-Windows
-
-### Stage 2: Build Self-Hosted Zig from Zig Source Code
-
-Now we use the stage1 binary:
-
-```
-zig build --prefix $(pwd)/stage2 -Denable-llvm
-```
-
-This produces `stage2/bin/zig` which can be used for testing and development.
-Once it is feature complete, it will be used to build stage 3 - the final compiler
-binary.
-
-### Stage 3: Rebuild Self-Hosted Zig Using the Self-Hosted Compiler
-
-*Note: Stage 2 compiler is not yet able to build Stage 3. Building Stage 3 is
-not yet supported.*
-
-Once the self-hosted compiler can build itself, this will be the actual
-compiler binary that we will install to the system. Until then, users should
-use stage 1.
-
-#### Debug / Development Build
-
-```
-stage2/bin/zig build
-```
-
-This produces `zig-cache/bin/zig`.
-
-#### Release / Install Build
-
-```
-stage2/bin/zig build install -Drelease
-```
+ * [download a pre-built binary](https://ziglang.org/download/)
+ * [install from a package manager](https://github.com/ziglang/zig/wiki/Install-Zig-from-a-Package-Manager)
+ * [build from source](https://github.com/ziglang/zig/wiki/Building-Zig-From-Source)
+ * [bootstrap zig for any target](https://github.com/ziglang/zig-bootstrap)

 ## License

--- a/build.zig
+++ b/build.zig
--- a/ci/azure/linux_script
+++ b/ci/azure/linux_script
@@ -3,54 +3,108 @@
 set -x
 set -e

-BUILDDIR="$(pwd)"
-
-sudo sh -c 'echo "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-11 main" >> /etc/apt/sources.list'
-wget -O - http://apt.llvm.org/llvm-snapshot.gpg.key|sudo apt-key add -
-sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
 sudo apt-get update -q
+sudo apt-get install -y cmake s3cmd tidy

-sudo apt-get remove -y llvm-*
-sudo rm -rf /usr/local/*
-sudo apt-get install -y libxml2-dev libclang-11-dev llvm-11 llvm-11-dev liblld-11-dev cmake s3cmd gcc-7 g++-7 ninja-build tidy
+ZIGDIR="$(pwd)"
+ARCH="$(uname -m)"
+TARGET="$ARCH-linux-musl"
+CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.8.1-dev.94+535615117"
+PREFIX="$HOME/$CACHE_BASENAME"
+MCPU="baseline"
+JOBS="-j$(nproc)"

-QEMUBASE="qemu-linux-x86_64-5.1.0"
-wget https://ziglang.org/deps/$QEMUBASE.tar.xz
-tar xf $QEMUBASE.tar.xz
-PATH=$PWD/$QEMUBASE/bin:$PATH
+rm -rf $PREFIX
+cd $HOME

-WASMTIME="wasmtime-v0.20.0-x86_64-linux"
-wget https://github.com/bytecodealliance/wasmtime/releases/download/v0.20.0/$WASMTIME.tar.xz
-tar xf $WASMTIME.tar.xz
-PATH=$PWD/$WASMTIME:$PATH
+wget -nv "https://ziglang.org/deps/$CACHE_BASENAME.tar.xz"
+tar xf "$CACHE_BASENAME.tar.xz"
+
+QEMUBASE="qemu-linux-x86_64-5.2.0.1"
+wget -nv "https://ziglang.org/deps/$QEMUBASE.tar.xz"
+tar xf "$QEMUBASE.tar.xz"
+export PATH="$(pwd)/$QEMUBASE/bin:$PATH"
+
+# Bump to v0.23 once this issue is resolved:
+# https://github.com/ziglang/zig/issues/8742
+WASMTIME="wasmtime-v0.22.1-x86_64-linux"
+wget -nv "https://github.com/bytecodealliance/wasmtime/releases/download/v0.22.1/$WASMTIME.tar.xz"
+tar xf "$WASMTIME.tar.xz"
+export PATH="$(pwd)/$WASMTIME:$PATH"
+
+ZIG="$PREFIX/bin/zig"
+export CC="$ZIG cc -target $TARGET -mcpu=$MCPU"
+export CXX="$ZIG c++ -target $TARGET -mcpu=$MCPU"
+
+cd $ZIGDIR

 # Make the `zig version` number consistent.
 # This will affect the cmake command below.
 git config core.abbrev 9
+git fetch --unshallow || true
+git fetch --tags

-export CC=gcc-7
-export CXX=g++-7
 mkdir build
 cd build
-cmake .. -DCMAKE_BUILD_TYPE=Release -GNinja
-ninja install
-./zig build test -Denable-qemu -Denable-wasmtime
+cmake .. \
+  -DCMAKE_INSTALL_PREFIX="$(pwd)/release" \
+  -DCMAKE_PREFIX_PATH="$PREFIX" \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DZIG_TARGET_TRIPLE="$TARGET" \
+  -DZIG_TARGET_MCPU="$MCPU" \
+  -DZIG_STATIC=ON

-# look for HTML errors
+# Now cmake will use zig as the C/C++ compiler. We reset the environment variables
+# so that installation and testing do not get affected by them.
+unset CC
+unset CXX
+
+make $JOBS install
+
+# Here we rebuild zig but this time using the Zig binary we just now produced to
+# build zig1.o rather than relying on the one built with stage0. See
+# https://github.com/ziglang/zig/issues/6830 for more details.
+cmake .. -DZIG_EXECUTABLE="$(pwd)/release/bin/zig"
+make $JOBS install
+
+for step in test-toolchain test-std docs; do
+  release/bin/zig build $step -Denable-qemu -Denable-wasmtime
+done
+
+# Look for HTML errors.
 tidy -qe ../zig-cache/langref.html

-VERSION="$(./zig version)"
-
 if [ "${BUILD_REASON}" != "PullRequest" ]; then
-  ARTIFACTSDIR="$BUILDDIR/artifacts"
-  mkdir "$ARTIFACTSDIR"
-  docker run -i --mount type=bind,source="$ARTIFACTSDIR",target=/z ziglang/static-base:llvm11-x86_64-1 -j2 $BUILD_SOURCEVERSION
-  TARBALL="$(ls $ARTIFACTSDIR)"
-  mv "$DOWNLOADSECUREFILE_SECUREFILEPATH" "$HOME/.s3cfg"
-  s3cmd put -P --add-header="cache-control: public, max-age=31536000, immutable" "$ARTIFACTSDIR/$TARBALL" s3://ziglang.org/builds/
+  # Produce the experimental std lib documentation.
+  mkdir -p release/docs/std
+  release/bin/zig test ../lib/std/std.zig \
+    --override-lib-dir ../lib \
+    -femit-docs=release/docs/std \
+    -fno-emit-bin

-  SHASUM=$(sha256sum $ARTIFACTSDIR/$TARBALL | cut '-d ' -f1)
-  BYTESIZE=$(wc -c < $ARTIFACTSDIR/$TARBALL)
+  mv ../LICENSE release/
+  mv ../zig-cache/langref.html release/docs/
+
+  # Remove the unnecessary bin dir in $prefix/bin/zig
+  mv release/bin/zig release/
+  rmdir release/bin
+
+  # Remove the unnecessary zig dir in $prefix/lib/zig/std/std.zig
+  mv release/lib/zig release/lib2
+  rmdir release/lib
+  mv release/lib2 release/lib
+
+  VERSION=$(release/zig version)
+  DIRNAME="zig-linux-$ARCH-$VERSION"
+  TARBALL="$DIRNAME.tar.xz"
+  mv release "$DIRNAME"
+  tar cfJ "$TARBALL" "$DIRNAME"
+
+  mv "$DOWNLOADSECUREFILE_SECUREFILEPATH" "$HOME/.s3cfg"
+  s3cmd put -P --add-header="cache-control: public, max-age=31536000, immutable" "$TARBALL" s3://ziglang.org/builds/
+
+  SHASUM=$(sha256sum $TARBALL | cut '-d ' -f1)
+  BYTESIZE=$(wc -c < $TARBALL)

  JSONFILE="linux-$GITBRANCH.json"
  touch $JSONFILE
@@ -59,7 +113,7 @@ if [ "${BUILD_REASON}" != "PullRequest" ]; then
  echo "\"size\": \"$BYTESIZE\"}" >>$JSONFILE

  s3cmd put -P --add-header="Cache-Control: max-age=0, must-revalidate" "$JSONFILE" "s3://ziglang.org/builds/$JSONFILE"
-  s3cmd put -P "$JSONFILE" "s3://ziglang.org/builds/x86_64-linux-$VERSION.json"
+  s3cmd put -P "$JSONFILE" "s3://ziglang.org/builds/$ARCH-linux-$VERSION.json"

  # `set -x` causes these variables to be mangled.
  # See https://developercommunity.visualstudio.com/content/problem/375679/pipeline-variable-incorrectly-inserts-single-quote.html
--- a/ci/azure/macos_arm64_script
+++ b/ci/azure/macos_arm64_script
@@ -0,0 +1,131 @@
+#!/bin/sh
+
+set -x
+set -e
+
+brew update && brew install s3cmd
+
+ZIGDIR="$(pwd)"
+
+HOST_ARCH="x86_64"
+HOST_TARGET="$HOST_ARCH-macos-gnu"
+HOST_MCPU="baseline"
+HOST_CACHE_BASENAME="zig+llvm+lld+clang-$HOST_TARGET-0.8.0-dev.2703+c12704a33"
+HOST_PREFIX="$HOME/$HOST_CACHE_BASENAME"
+
+ARCH="aarch64"
+TARGET="$ARCH-macos-gnu"
+MCPU="apple_a14"
+CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.8.0-dev.2703+c12704a33"
+PREFIX="$HOME/$CACHE_BASENAME"
+
+JOBS="-j2"
+
+rm -rf $HOST_PREFIX $PREFIX
+cd $HOME
+
+wget -nv "https://ziglang.org/deps/$HOST_CACHE_BASENAME.tar.xz"
+wget -nv "https://ziglang.org/deps/$CACHE_BASENAME.tar.xz"
+tar xf "$HOST_CACHE_BASENAME.tar.xz"
+tar xf "$CACHE_BASENAME.tar.xz"
+
+cd $ZIGDIR
+
+# Make the `zig version` number consistent.
+# This will affect the cmake command below.
+git config core.abbrev 9
+git fetch --unshallow || true
+git fetch --tags
+
+# Build host zig compiler in debug so that we can get the
+# current version when packaging
+
+ZIG="$HOST_PREFIX/bin/zig"
+
+export CC="$ZIG cc -target $HOST_TARGET -mcpu=$HOST_MCPU"
+export CXX="$ZIG c++ -target $HOST_TARGET -mcpu=$HOST_MCPU"
+
+mkdir build.host
+cd build.host
+cmake .. \
+  -DCMAKE_INSTALL_PREFIX="$(pwd)/release" \
+  -DCMAKE_PREFIX_PATH="$HOST_PREFIX" \
+  -DCMAKE_BUILD_TYPE=Debug \
+  -DZIG_TARGET_TRIPLE="$HOST_TARGET" \
+  -DZIG_TARGET_MCPU="$HOST_MCPU" \
+  -DZIG_STATIC=ON
+
+unset CC
+unset CXX
+
+make $JOBS install
+
+# Build zig compiler cross-compiled for arm64
+cd $ZIGDIR
+
+ZIG="$ZIGDIR/build.host/release/bin/zig"
+
+export CC="$ZIG cc -target $TARGET -mcpu=$MCPU"
+export CXX="$ZIG c++ -target $TARGET -mcpu=$MCPU"
+
+mkdir build
+cd build
+cmake .. \
+  -DCMAKE_INSTALL_PREFIX="$(pwd)/release" \
+  -DCMAKE_PREFIX_PATH="$PREFIX" \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DZIG_TARGET_TRIPLE="$TARGET" \
+  -DZIG_TARGET_MCPU="$MCPU" \
+  -DZIG_EXECUTABLE="$ZIG" \
+  -DZIG_STATIC=ON
+
+unset CC
+unset CXX
+
+make $JOBS install
+
+if [ "${BUILD_REASON}" != "PullRequest" ]; then
+  mv ../LICENSE release/
+
+  # We do not run the test suite here, but we still need the langref.
+  mkdir -p release/docs
+  $ZIG run ../doc/docgen.zig -- $ZIG ../doc/langref.html.in release/docs/langref.html
+
+  # Produce the experimental std lib documentation.
+  mkdir -p release/docs/std
+  $ZIG test ../lib/std/std.zig \
+    --override-lib-dir ../lib \
+    -femit-docs=release/docs/std \
+    -fno-emit-bin
+
+  mv release/bin/zig release/
+  rmdir release/bin
+
+  VERSION=$(../build.host/release/bin/zig version)
+  DIRNAME="zig-macos-$ARCH-$VERSION"
+  TARBALL="$DIRNAME.tar.xz"
+  mv release "$DIRNAME"
+  tar cfJ "$TARBALL" "$DIRNAME"
+
+  mv "$DOWNLOADSECUREFILE_SECUREFILEPATH" "$HOME/.s3cfg"
+  s3cmd put -P --add-header="cache-control: public, max-age=31536000, immutable" "$TARBALL" s3://ziglang.org/builds/
+
+  SHASUM=$(shasum -a 256 $TARBALL | cut '-d ' -f1)
+  BYTESIZE=$(wc -c < $TARBALL)
+
+  JSONFILE="macos-$GITBRANCH.json"
+  touch $JSONFILE
+  echo "{\"tarball\": \"$TARBALL\"," >>$JSONFILE
+  echo "\"shasum\": \"$SHASUM\"," >>$JSONFILE
+  echo "\"size\": \"$BYTESIZE\"}" >>$JSONFILE
+
+  s3cmd put -P --add-header="Cache-Control: max-age=0, must-revalidate" "$JSONFILE" "s3://ziglang.org/builds/$JSONFILE"
+  s3cmd put -P "$JSONFILE" "s3://ziglang.org/builds/$ARCH-macos-$VERSION.json"
+
+  # `set -x` causes these variables to be mangled.
+  # See https://developercommunity.visualstudio.com/content/problem/375679/pipeline-variable-incorrectly-inserts-single-quote.html
+  set +x
+  echo "##vso[task.setvariable variable=tarball;isOutput=true]$TARBALL"
+  echo "##vso[task.setvariable variable=shasum;isOutput=true]$SHASUM"
+  echo "##vso[task.setvariable variable=bytesize;isOutput=true]$BYTESIZE"
+fi
--- a/ci/azure/macos_script
+++ b/ci/azure/macos_script
@@ -3,31 +3,33 @@
 set -x
 set -e

-brew install s3cmd
+brew update && brew install s3cmd

 ZIGDIR="$(pwd)"
 ARCH="x86_64"
-CACHE_BASENAME="zig+llvm+lld+clang-$ARCH-macos-gnu-0.6.0+1c9ef63a"
+TARGET="$ARCH-macos-gnu"
+MCPU="baseline"
+CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.8.1-dev.94+535615117"
 PREFIX="$HOME/$CACHE_BASENAME"
 JOBS="-j2"

 rm -rf $PREFIX
 cd $HOME
+
 wget -nv "https://ziglang.org/deps/$CACHE_BASENAME.tar.xz"
 tar xf "$CACHE_BASENAME.tar.xz"

 ZIG="$PREFIX/bin/zig"
-NATIVE_LIBC_TXT="$HOME/native_libc.txt"
-$ZIG libc > "$NATIVE_LIBC_TXT"
-export ZIG_LIBC="$NATIVE_LIBC_TXT"
-export CC="$ZIG cc"
-export CXX="$ZIG c++"
+export CC="$ZIG cc -target $TARGET -mcpu=$MCPU"
+export CXX="$ZIG c++ -target $TARGET -mcpu=$MCPU"

 cd $ZIGDIR

 # Make the `zig version` number consistent.
 # This will affect the cmake command below.
 git config core.abbrev 9
+git fetch --unshallow || true
+git fetch --tags

 mkdir build
 cd build
@@ -35,15 +37,14 @@ cmake .. \
  -DCMAKE_INSTALL_PREFIX="$(pwd)/release" \
  -DCMAKE_PREFIX_PATH="$PREFIX" \
  -DCMAKE_BUILD_TYPE=Release \
-  -DZIG_TARGET_TRIPLE="$ARCH-native-gnu" \
-  -DZIG_TARGET_MCPU="baseline" \
+  -DZIG_TARGET_TRIPLE="$TARGET" \
+  -DZIG_TARGET_MCPU="$MCPU" \
  -DZIG_STATIC=ON

 # Now cmake will use zig as the C/C++ compiler. We reset the environment variables
 # so that installation and testing do not get affected by them.
 unset CC
 unset CXX
-unset ZIG_LIBC

 make $JOBS install

@@ -53,7 +54,9 @@ make $JOBS install
 cmake .. -DZIG_EXECUTABLE="$(pwd)/release/bin/zig"
 make $JOBS install

-release/bin/zig build test
+for step in test-toolchain test-std docs; do
+  release/bin/zig build $step
+done

 if [ "${BUILD_REASON}" != "PullRequest" ]; then
  mv ../LICENSE release/
--- a/ci/azure/pipelines.yml
+++ b/ci/azure/pipelines.yml
@@ -2,9 +2,7 @@ jobs:
 - job: BuildMacOS
  pool:
    vmImage: 'macOS-10.15'
-
  timeoutInMinutes: 360
-
  steps:
  - task: DownloadSecureFile@1
    inputs:
@@ -12,12 +10,21 @@ jobs:
  - script: ci/azure/macos_script
    name: main
    displayName: 'Build and test'
+- job: BuildMacOS_arm64
+  pool:
+    vmImage: 'macOS-10.15'
+  timeoutInMinutes: 180
+  steps:
+  - task: DownloadSecureFile@1
+    inputs:
+      secureFile: s3cfg
+  - script: ci/azure/macos_arm64_script
+    name: main
+    displayName: 'Build'
 - job: BuildLinux
  pool:
    vmImage: 'ubuntu-18.04'
-
  timeoutInMinutes: 360
-
  steps:
  - task: DownloadSecureFile@1
    inputs:
@@ -31,7 +38,7 @@ jobs:
  timeoutInMinutes: 360
  steps:
  - powershell: |
-      (New-Object Net.WebClient).DownloadFile("https://github.com/msys2/msys2-installer/releases/download/2020-09-03/msys2-base-x86_64-20200903.sfx.exe", "sfx.exe")
+      (New-Object Net.WebClient).DownloadFile("https://github.com/msys2/msys2-installer/releases/download/2021-06-04/msys2-base-x86_64-20210604.sfx.exe", "sfx.exe")
      .\sfx.exe -y -o\
      del sfx.exe
    displayName: Download/Extract/Install MSYS2
@@ -53,6 +60,7 @@ jobs:
 - job: OnMasterSuccess
  dependsOn:
  - BuildMacOS
+  - BuildMacOS_arm64
  - BuildLinux
  - BuildWindows
  condition: and(succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/master'))
--- a/ci/azure/windows_msvc_install
+++ b/ci/azure/windows_msvc_install
@@ -3,9 +3,9 @@
 set -x
 set -e

-pacman -Su --needed --noconfirm
+pacman -Suy --needed --noconfirm
 pacman -S --needed --noconfirm wget p7zip python3-pip tar xz

 pip install s3cmd
-wget -nv "https://ziglang.org/deps/llvm%2bclang%2blld-11.0.0-x86_64-windows-msvc-release-mt.tar.xz"
-tar xf llvm+clang+lld-11.0.0-x86_64-windows-msvc-release-mt.tar.xz
+wget -nv "https://ziglang.org/deps/llvm%2bclang%2blld-12.0.1-rc1-x86_64-windows-msvc-release-mt.tar.xz"
+tar xf llvm+clang+lld-12.0.1-rc1-x86_64-windows-msvc-release-mt.tar.xz
--- a/ci/azure/windows_msvc_script.bat
+++ b/ci/azure/windows_msvc_script.bat
@@ -11,32 +11,23 @@ SET "MSYSTEM=%PREVMSYSTEM%"

 SET "ZIGBUILDDIR=%SRCROOT%\build"
 SET "ZIGINSTALLDIR=%ZIGBUILDDIR%\dist"
-SET "ZIGPREFIXPATH=%SRCROOT%\llvm+clang+lld-11.0.0-x86_64-windows-msvc-release-mt"
+SET "ZIGPREFIXPATH=%SRCROOT%\llvm+clang+lld-12.0.1-rc1-x86_64-windows-msvc-release-mt"

 call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64

 REM Make the `zig version` number consistent.
 REM This will affect the cmake command below.
 git.exe config core.abbrev 9
+git.exe fetch --unshallow
+git.exe fetch --tags

 mkdir %ZIGBUILDDIR%
 cd %ZIGBUILDDIR%
-cmake.exe .. -Thost=x64 -G"Visual Studio 16 2019" -A x64 "-DCMAKE_INSTALL_PREFIX=%ZIGINSTALLDIR%" "-DCMAKE_PREFIX_PATH=%ZIGPREFIXPATH%" -DCMAKE_BUILD_TYPE=Release || exit /b
+cmake.exe .. -Thost=x64 -G"Visual Studio 16 2019" -A x64 "-DCMAKE_INSTALL_PREFIX=%ZIGINSTALLDIR%" "-DCMAKE_PREFIX_PATH=%ZIGPREFIXPATH%" -DCMAKE_BUILD_TYPE=Release -DZIG_OMIT_STAGE2=ON || exit /b
 msbuild /maxcpucount /p:Configuration=Release INSTALL.vcxproj || exit /b

-"%ZIGINSTALLDIR%\bin\zig.exe" build test-behavior -Dskip-non-native || exit /b
-"%ZIGINSTALLDIR%\bin\zig.exe" build test-stage2 -Dskip-non-native || exit /b
-"%ZIGINSTALLDIR%\bin\zig.exe" build test-fmt -Dskip-non-native || exit /b
+"%ZIGINSTALLDIR%\bin\zig.exe" build test-toolchain -Dskip-non-native -Dskip-stage2-tests || exit /b
 "%ZIGINSTALLDIR%\bin\zig.exe" build test-std -Dskip-non-native || exit /b
-"%ZIGINSTALLDIR%\bin\zig.exe" build test-compiler-rt -Dskip-non-native || exit /b
-"%ZIGINSTALLDIR%\bin\zig.exe" build test-compare-output -Dskip-non-native || exit /b
-"%ZIGINSTALLDIR%\bin\zig.exe" build test-standalone -Dskip-non-native || exit /b
-"%ZIGINSTALLDIR%\bin\zig.exe" build test-stack-traces -Dskip-non-native || exit /b
-"%ZIGINSTALLDIR%\bin\zig.exe" build test-cli -Dskip-non-native || exit /b
-"%ZIGINSTALLDIR%\bin\zig.exe" build test-asm-link -Dskip-non-native || exit /b
-"%ZIGINSTALLDIR%\bin\zig.exe" build test-runtime-safety -Dskip-non-native || exit /b
-"%ZIGINSTALLDIR%\bin\zig.exe" build test-translate-c -Dskip-non-native || exit /b
-"%ZIGINSTALLDIR%\bin\zig.exe" build test-run-translated-c -Dskip-non-native || exit /b
 "%ZIGINSTALLDIR%\bin\zig.exe" build docs || exit /b

 set "PATH=%CD:~0,2%\msys64\usr\bin;C:\Windows\system32;C:\Windows;C:\Windows\System32\Wbem"
--- a/ci/drone/drone.yml
+++ b/ci/drone/drone.yml
@@ -6,8 +6,39 @@ platform:
  arch: arm64

 steps:
- name: build-and-test
-  image: ziglang/static-base:llvm11-aarch64-1
+- name: build
+  image: ziglang/static-base:llvm12-aarch64-5
+  commands:
+  - ./ci/drone/linux_script_build
+
+- name: test-1
+  depends_on:
+  - build
+  image: ziglang/static-base:llvm12-aarch64-5
+  commands:
+  - ./ci/drone/linux_script_test 1
+
+- name: test-2
+  depends_on:
+  - build
+  image: ziglang/static-base:llvm12-aarch64-5
+  commands:
+  - ./ci/drone/linux_script_test 2
+
+- name: test-3
+  depends_on:
+  - build
+  image: ziglang/static-base:llvm12-aarch64-5
+  commands:
+  - ./ci/drone/linux_script_test 3
+
+- name: finalize
+  depends_on:
+  - build
+  - test-1
+  - test-2
+  - test-3
+  image: ziglang/static-base:llvm12-aarch64-5
  environment:
    SRHT_OAUTH_TOKEN:
      from_secret: SRHT_OAUTH_TOKEN
@@ -16,4 +47,4 @@ steps:
    AWS_SECRET_ACCESS_KEY:
      from_secret: AWS_SECRET_ACCESS_KEY
  commands:
-  - ./ci/drone/linux_script
+  - ./ci/drone/linux_script_finalize
--- a/ci/drone/linux_script
+++ b/ci/drone/linux_script
@@ -1,58 +0,0 @@
-#!/bin/sh
-
-set -x
-set -e
-
-TRIPLEARCH="$(uname -m)"
-BUILDDIR="$(pwd)"
-DISTDIR="$(pwd)/dist"
-
-apk update
-apk add py3-pip xz perl-utils jq curl samurai
-pip3 install s3cmd
-
-# Make the `zig version` number consistent.
-# This will affect the cmake command below.
-git config core.abbrev 9
-
-mkdir build
-cd build
-cmake .. -DCMAKE_BUILD_TYPE=Release "-DCMAKE_INSTALL_PREFIX=$DISTDIR" -DZIG_STATIC=ON -DCMAKE_PREFIX_PATH=/deps/local -GNinja
-
-samu install
-./zig build test -Dskip-release -Dskip-non-native
-
-if [ -z "$DRONE_PULL_REQUEST" ]; then
-  mv ../LICENSE "$DISTDIR/"
-  mv ../zig-cache/langref.html "$DISTDIR/"
-  mv "$DISTDIR/bin/zig" "$DISTDIR/"
-  rmdir "$DISTDIR/bin"
-
-  GITBRANCH="$DRONE_BRANCH"
-  VERSION="$("$DISTDIR/zig" version)"
-  DIRNAME="zig-linux-$TRIPLEARCH-$VERSION"
-  TARBALL="$DIRNAME.tar.xz"
-  mv "$DISTDIR" "$DIRNAME"
-  tar cfJ "$TARBALL" "$DIRNAME"
-
-  s3cmd put -P --add-header="cache-control: public, max-age=31536000, immutable" "$TARBALL" s3://ziglang.org/builds/
-
-  SHASUM=$(shasum -a 256 $TARBALL | cut '-d ' -f1)
-  BYTESIZE=$(wc -c < $TARBALL)
-
-  JSONFILE="$TRIPLEARCH-linux-$GITBRANCH.json"
-  touch $JSONFILE
-  echo "{\"tarball\": \"$TARBALL\"," >>$JSONFILE
-  echo "\"shasum\": \"$SHASUM\"," >>$JSONFILE
-  echo "\"size\": \"$BYTESIZE\"}" >>$JSONFILE
-
-  s3cmd put -P --add-header="Cache-Control: max-age=0, must-revalidate" "$JSONFILE" "s3://ziglang.org/builds/$JSONFILE"
-  s3cmd put -P "$JSONFILE" "s3://ziglang.org/builds/$TRIPLEARCH-linux-$VERSION.json"
-  if [ "$GITBRANCH" = "master" ]; then 
-    # avoid leaking oauth token
-    set +x
-
-    cd "$BUILDDIR"
-    ./ci/srht/on_master_success "$VERSION" "$SRHT_OAUTH_TOKEN"
-  fi
-fi
--- a/ci/drone/linux_script_base
+++ b/ci/drone/linux_script_base
@@ -0,0 +1,22 @@
+#!/bin/sh
+
+# https://docs.drone.io/pipeline/docker/syntax/workspace/
+#
+# Drone automatically creates a temporary volume, known as your workspace,
+# where it clones your repository. The workspace is the current working
+# directory for each step in your pipeline.
+#
+# Because the workspace is a volume, filesystem changes are persisted between
+# pipeline steps. In other words, individual steps can communicate and share
+# state using the filesystem.
+#
+# Workspace volumes are ephemeral. They are created when the pipeline starts
+# and destroyed after the pipeline completes.
+
+set -x
+set -e
+
+TRIPLEARCH="$(uname -m)"
+DISTDIR="$DRONE_WORKSPACE/dist"
+
+export ZIG_GLOBAL_CACHE_DIR="$DRONE_WORKSPACE/zig-cache"
--- a/ci/drone/linux_script_build
+++ b/ci/drone/linux_script_build
@@ -0,0 +1,59 @@
+#!/bin/sh
+
+. ./ci/drone/linux_script_base
+
+PREFIX="/deps/local"
+ZIG="$PREFIX/bin/zig"
+TARGET="$TRIPLEARCH-linux-musl"
+MCPU="baseline"
+
+export CC="$ZIG cc -target $TARGET -mcpu=$MCPU"
+export CXX="$ZIG c++ -target $TARGET -mcpu=$MCPU"
+
+# CMAKE_AR and CMAKE_RANLIB treat any spaces as part of the executable path
+# rather than as CLI args, so we create wrapper scripts for `zig ar` and
+# `zig ranlib`. They forward "$@" quoted so each argument arrives intact.
+
+cat <<'ENDFILE' >$PREFIX/bin/ar
+#!/bin/sh
+/deps/local/bin/zig ar "$@"
+ENDFILE
+
+cat <<'ENDFILE' >$PREFIX/bin/ranlib
+#!/bin/sh
+/deps/local/bin/zig ranlib "$@"
+ENDFILE
+
+chmod +x $PREFIX/bin/ar
+chmod +x $PREFIX/bin/ranlib
+
+# Make the `zig version` number consistent.
+# This will affect the cmake command below.
+git config core.abbrev 9
+git fetch --unshallow || true
+git fetch --tags
+
+mkdir build
+cd build
+cmake .. \
+  -DCMAKE_INSTALL_PREFIX="$DISTDIR" \
+  -DCMAKE_PREFIX_PATH="$PREFIX" \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DCMAKE_AR="$PREFIX/bin/ar" \
+  -DCMAKE_RANLIB="$PREFIX/bin/ranlib" \
+  -DZIG_TARGET_TRIPLE="$TARGET" \
+  -DZIG_TARGET_MCPU="$MCPU" \
+  -DZIG_STATIC=ON \
+  -GNinja
+
+# Now CMake will use Zig as the C/C++ compiler. We reset the environment variables
+# so that installation and testing do not get affected by them.
+unset CC
+unset CXX
+samu install
+
+# Here we rebuild Zig but this time using the Zig binary we just now produced to
+# build zig1.o rather than relying on the one built with stage0. See
+# https://github.com/ziglang/zig/issues/6830 for more details.
+cmake .. -DZIG_EXECUTABLE="$DISTDIR/bin/zig"
+samu install
--- a/ci/drone/linux_script_finalize
+++ b/ci/drone/linux_script_finalize
@@ -0,0 +1,46 @@
+#!/bin/sh
+
+. ./ci/drone/linux_script_base
+
+if [ -n "$DRONE_PULL_REQUEST" ]; then
+  exit 0
+fi
+
+apk update
+apk add py3-pip perl-utils jq curl
+pip3 install s3cmd
+
+cd build
+
+mv ../LICENSE "$DISTDIR/"
+# docs are disabled due to: https://github.com/ziglang/zig/issues/8597
+#mv ../zig-cache/langref.html "$DISTDIR/"
+mv "$DISTDIR/bin/zig" "$DISTDIR/"
+rmdir "$DISTDIR/bin"
+
+GITBRANCH="$DRONE_BRANCH"
+VERSION="$("$DISTDIR/zig" version)"
+DIRNAME="zig-linux-$TRIPLEARCH-$VERSION"
+TARBALL="$DIRNAME.tar.xz"
+mv "$DISTDIR" "$DIRNAME"
+tar cfJ "$TARBALL" "$DIRNAME"
+
+s3cmd put -P --add-header="cache-control: public, max-age=31536000, immutable" "$TARBALL" s3://ziglang.org/builds/
+
+SHASUM=$(shasum -a 256 $TARBALL | cut '-d ' -f1)
+BYTESIZE=$(wc -c < $TARBALL)
+
+JSONFILE="tarball.json"
+touch $JSONFILE
+echo "{\"tarball\": \"$TARBALL\"," >>$JSONFILE
+echo "\"shasum\": \"$SHASUM\"," >>$JSONFILE
+echo "\"size\": \"$BYTESIZE\"}" >>$JSONFILE
+
+s3cmd put -P "$JSONFILE" "s3://ziglang.org/builds/$TRIPLEARCH-linux-$VERSION.json"
+if [ "$GITBRANCH" = "master" ]; then
+  # avoid leaking oauth token
+  set +x
+
+  cd "$DRONE_WORKSPACE"
+  ./ci/srht/on_master_success "$VERSION" "$SRHT_OAUTH_TOKEN"
+fi
--- a/ci/drone/linux_script_test
+++ b/ci/drone/linux_script_test
@@ -0,0 +1,46 @@
+#!/bin/sh
+
+. ./ci/drone/linux_script_base
+
+# only release-fast builds of test suite due to: https://github.com/ziglang/zig/issues/8597
+#
+# Some test suite components will be missing because they do not support
+# forcing -OReleaseFast
+#
+# see `zig build --help` for the full list of test-* components
+case "$1" in
+  1)
+    steps="\
+      test-stage2 \
+      test-fmt \
+      test-behavior"
+    ;;
+  2)
+    steps="test-std"
+    ;;
+  3)
+    steps="\
+      test-compiler-rt \
+      test-minilibc \
+      test-compare-output \
+      test-translate-c \
+      test-run-translated-c"
+    ;;
+  '')
+    echo "error: expecting test group argument"
+    exit 1
+    ;;
+  *)
+    echo "error: unknown test group: $1"
+    exit 1
+    ;;
+esac
+
+# only release-fast builds of test suite due to: https://github.com/ziglang/zig/issues/8597
+./build/zig build \
+  -Drelease \
+  -Dskip-debug \
+  -Dskip-release-small \
+  -Dskip-release-safe \
+  -Dskip-non-native \
+  $steps
--- a/ci/srht/freebsd_script
+++ b/ci/srht/freebsd_script
@@ -4,12 +4,11 @@ set -x
 set -e

 sudo pkg update -fq
-sudo pkg install -y cmake py27-s3cmd wget curl jq
+sudo pkg install -y cmake py38-s3cmd wget curl jq samurai

 ZIGDIR="$(pwd)"
-CACHE_BASENAME="llvm+clang+lld-11.0.0-x86_64-freebsd-release"
+CACHE_BASENAME="zig+llvm+lld+clang-x86_64-freebsd-gnu-0.8.0-dev.2703+c12704a33"
 PREFIX="$HOME/$CACHE_BASENAME"
-JOBS="-j$(sysctl -n hw.ncpu)"

 cd $HOME
 wget -nv "https://ziglang.org/deps/$CACHE_BASENAME.tar.xz"
@@ -20,6 +19,8 @@ cd $ZIGDIR
 # Make the `zig version` number consistent.
 # This will affect the cmake command below.
 git config core.abbrev 9
+git fetch --unshallow || true
+git fetch --tags

 # SourceHut reports that it is a terminal that supports escape codes, but it
 # is a filthy liar. Here we tell Zig to not try to send any terminal escape
@@ -28,8 +29,14 @@ export TERM=dumb

 mkdir build
 cd build
-cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=$PREFIX "-DCMAKE_INSTALL_PREFIX=$(pwd)/release" -DZIG_STATIC=ON
-make $JOBS install
+cmake .. \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_PREFIX_PATH=$PREFIX \
+    "-DCMAKE_INSTALL_PREFIX=$(pwd)/release" \
+    -DZIG_STATIC=ON \
+    -DZIG_TARGET_TRIPLE=x86_64-freebsd-gnu \
+    -GNinja
+samu install

 # Here we skip some tests to save time.
 release/bin/zig build test -Dskip-compile-errors -Dskip-non-native
--- a/ci/srht/index.html
+++ b/ci/srht/index.html
@@ -0,0 +1,737 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8">
+    <title>Releases ⚡ The Zig Programming Language</title>
+    <link rel="icon" href="/favicon.png">
+    <style>
+      body{
+        font-family: system-ui, -apple-system, Roboto, "Segoe UI", sans-serif;
+        line-height: 1.45;
+        margin-left: 0;
+        margin-right: 0;
+      }
+      p {
+        margin: 0.8em 0;
+      }
+      
+      h1, h2, h3, h4 {
+        margin: 0.5em 0 0.5em;
+        line-height: 1.2;
+        font-weight: bold;
+        color: #666;
+      }
+
+      h1 a, h2 a, h3 a, h4 a {
+        text-decoration: none;
+        color: #666;
+      }
+      
+      a.hdr {
+        visibility: hidden;
+      }
+      h1:hover > a.hdr, h2:hover > a.hdr, h3:hover > a.hdr, h4:hover > a.hdr, h5:hover > a.hdr {
+        visibility: visible;
+      }
+
+      h1 { font-size: 2.0em; }
+      
+      h2 { font-size: 1.5em; }
+      
+      h3 { font-size: 1.25em; }
+
+      h4 { font-size: 1.0em; }
+
+      a {
+        color: #2A6286;
+      }
+      
+      a:not(:hover) {
+        text-decoration: none;
+      }
+      
+      th, td {
+        padding: 0.6em;
+        text-align: left;
+        white-space: nowrap;
+      }
+      
+      td {
+        font-size: 0.96em;
+      }
+      
+      th {
+        border-bottom: 2px solid #f2f3f3;
+      }
+      
+      tr:nth-child(even) {
+        background: #f2f3f3;
+      }
+      
+      .container {
+        margin: 0 auto;
+        position: relative;
+        max-width: 1000px;
+      }
+      
+      #navbar {
+        background-color: #737475;
+        padding: 5px 0;
+        border-top: 4px solid #f7a41d;
+        margin-bottom: 30px;
+      }
+      
+      #navbar .navbar-item, #navbar .navbar-item:visited {
+        color: white;
+        padding-right: 5px;
+        padding-left: 5px;
+      }
+
+      .code {
+        font-family: monospace;
+        font-size: 0.8em;
+      }
+
+      #header-image {
+          background-image: url(https://ziglang.org/img/zig-logo-dark.svg);
+          background-repeat: no-repeat;
+          width: 340px;
+          height: 90px;
+          display: block;
+          padding: 0;
+          margin: 0;
+      }
+
+      @media (prefers-color-scheme: dark) {
+        body{
+          background-color:#111;
+          color: #bbb;
+        }
+        a {
+          color: #88f;
+        }
+        table, th, td {
+          border-color: grey;
+        }
+        tr:nth-child(even) {
+          background: #1e1e1e;
+        }
+        h1 a, h2 a, h3 a, h4 a, h5 a {
+          color: #aaa;
+        }
+        #header-image {
+            background-image: url(https://ziglang.org/img/zig-logo-light.svg);
+        }
+      }
+    </style>
+  </head>
+  <body>
+    <div class="container">
+      <a href="/"><span id="header-image"></span></a>
+    </div>
+    <nav id="navbar">
+      <div class="container">
+        <a href="/download/" class="navbar-item">Download &amp; Documentation</a>
+        <a href="https://github.com/ziglang/zig" class="navbar-item">Source Code</a>
+        <a href="/news/" class="navbar-item">News</a>
+        <a href="https://github.com/ziglang/zig/wiki/Community" class="navbar-item">Join a Community</a>
+        <a href="/zsf/" class="navbar-item">
+          <svg style="color: #ea4aaa; vertical-align: middle;fill: currentColor; margin-right: 5px" viewBox="0 0 12 16" version="1.1" width="12" height="16" aria-hidden="true"><path fill-rule="evenodd" d="M9 2c-.97 0-1.69.42-2.2 1-.51.58-.78.92-.8 1-.02-.08-.28-.42-.8-1-.52-.58-1.17-1-2.2-1-1.632.086-2.954 1.333-3 3 0 .52.09 1.52.67 2.67C1.25 8.82 3.01 10.61 6 13c2.98-2.39 4.77-4.17 5.34-5.33C11.91 6.51 12 5.5 12 5c-.047-1.69-1.342-2.913-3-3z"></path></svg>Sponsor the Zig Software Foundation</a>
+      </div>
+    </nav>
+    <div class="container">
+      <h1>Releases</h1>
+      <p>You can also
+      <a href="https://github.com/ziglang/zig/wiki/Install-Zig-from-a-Package-Manager">install Zig from a package manager</a>.
+      </p>
+      <p>
+      There is a <a href="index.json">JSON version of this page</a>.
+      </p>
+
+      <h2 id="release-master">master</h2>
+      <ul>
+        <li>{{MASTER_DATE}}</li>
+        <li><a href="/documentation/master/">Language Reference</a></li>
+        <li><a href="/documentation/master/std/">Standard Library Documentation</a> (experimental)</li>
+      </ul>
+      <table>
+        <colgroup>
+          <col width="40%">
+          <col width="10%">
+          <col width="10%">
+        </colgroup>
+        <thead>
+          <tr>
+            <th>Filename</th>
+            <th>Kind</th>
+            <th>Size</th>
+            <th>Sha256</th>
+          </tr>
+        </thead>
+        <tbody>
+          <tr>
+            <td><a href="https://ziglang.org/builds/{{SRC_TARBALL}}">{{SRC_TARBALL}}</a></td>
+            <td>Source</td>
+            <td>{{SRC_BYTESIZE}}</td>
+            <td class="code">{{SRC_SHASUM}}</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/builds/{{X86_64_LINUX_TARBALL}}">{{X86_64_LINUX_TARBALL}}</a></td>
+            <td>Binary</td>
+            <td>{{X86_64_LINUX_BYTESIZE}}</td>
+            <td class="code">{{X86_64_LINUX_SHASUM}}</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/builds/{{AARCH64_LINUX_TARBALL}}">{{AARCH64_LINUX_TARBALL}}</a></td>
+            <td>Binary</td>
+            <td>{{AARCH64_LINUX_BYTESIZE}}</td>
+            <td class="code">{{AARCH64_LINUX_SHASUM}}</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/builds/{{X86_64_WINDOWS_TARBALL}}">{{X86_64_WINDOWS_TARBALL}}</a></td>
+            <td>Binary</td>
+            <td>{{X86_64_WINDOWS_BYTESIZE}}</td>
+            <td class="code">{{X86_64_WINDOWS_SHASUM}}</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/builds/{{X86_64_MACOS_TARBALL}}">{{X86_64_MACOS_TARBALL}}</a></td>
+            <td>Binary</td>
+            <td>{{X86_64_MACOS_BYTESIZE}}</td>
+            <td class="code">{{X86_64_MACOS_SHASUM}}</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/builds/{{AARCH64_MACOS_TARBALL}}">{{AARCH64_MACOS_TARBALL}}</a></td>
+            <td>Binary</td>
+            <td>{{AARCH64_MACOS_BYTESIZE}}</td>
+            <td class="code">{{AARCH64_MACOS_SHASUM}}</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/builds/{{X86_64_FREEBSD_TARBALL}}">{{X86_64_FREEBSD_TARBALL}}</a></td>
+            <td>Binary</td>
+            <td>{{X86_64_FREEBSD_BYTESIZE}}</td>
+            <td class="code">{{X86_64_FREEBSD_SHASUM}}</td>
+          </tr>
+        </tbody>
+      </table>
+
+      <h2 id="release-0.7.1">0.7.1</h2>
+      <ul>
+        <li>2020-12-13</li>
+        <li><a href="0.7.1/release-notes.html">Release Notes</a></li>
+        <li><a href="/documentation/0.7.1/">Language Reference</a></li>
+        <li><a href="/documentation/0.7.1/std/">Standard Library Documentation</a> (experimental)</li>
+      </ul>
+      <table>
+        <colgroup>
+          <col width="40%">
+          <col width="10%">
+          <col width="10%">
+        </colgroup>
+        <thead>
+          <tr>
+            <th>Filename</th>
+            <th>Kind</th>
+            <th>Size</th>
+            <th>Sha256</th>
+          </tr>
+        </thead>
+        <tbody>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.7.1/zig-0.7.1.tar.xz">zig-0.7.1.tar.xz</a></td>
+            <td>Source</td>
+            <td>11MiB</td>
+            <td class="code">2db3b944ab368d955b48743d9f7c963b8f96de1a441ba5a35e197237cc6dae44</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.7.1/zig-bootstrap-0.7.1.tar.xz">zig-bootstrap-0.7.1.tar.xz</a></td>
+            <td>Source</td>
+            <td>39MiB</td>
+            <td class="code">040f27c1fae4b0cac0a2782aecdb691f6a2f8e89db6a6ed35024c31c304fd9b2</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.7.1/zig-freebsd-x86_64-0.7.1.tar.xz">zig-freebsd-x86_64-0.7.1.tar.xz</a></td>
+            <td>Binary</td>
+            <td>38MiB</td>
+            <td class="code">e73c1dca35791a3183fdd5ecde0443ebbe180942efceafe651886034fb8def09</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.7.1/zig-linux-aarch64-0.7.1.tar.xz">zig-linux-aarch64-0.7.1.tar.xz</a></td>
+            <td>Binary</td>
+            <td>33MiB</td>
+            <td class="code">48ec90eba407e4587ddef7eecef25fec7e13587eb98e3b83c5f2f5fff2a5cbe7</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.7.1/zig-linux-armv7a-0.7.1.tar.xz">zig-linux-armv7a-0.7.1.tar.xz</a></td>
+            <td>Binary</td>
+            <td>35MiB</td>
+            <td class="code">5a0662e07b4c4968665e1f97558f8591f6facec45d2e0ff5715e661743107ceb</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.7.1/zig-linux-i386-0.7.1.tar.xz">zig-linux-i386-0.7.1.tar.xz</a></td>
+            <td>Binary</td>
+            <td>38MiB</td>
+            <td class="code">4882e052e5f83690bd0334bb4fc1702b5403cb3a3d2aa63fd7d6043d8afecba3</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.7.1/zig-linux-riscv64-0.7.1.tar.xz">zig-linux-riscv64-0.7.1.tar.xz</a></td>
+            <td>Binary</td>
+            <td>36MiB</td>
+            <td class="code">187294bfd35983348c3fe042901b42e67e7e36ab7f77a5f969d21c0051f4d21f</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.7.1/zig-linux-x86_64-0.7.1.tar.xz">zig-linux-x86_64-0.7.1.tar.xz</a></td>
+            <td>Binary</td>
+            <td>37MiB</td>
+            <td class="code">18c7b9b200600f8bcde1cd8d7f1f578cbc3676241ce36d771937ce19a8159b8d</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.7.1/zig-macos-x86_64-0.7.1.tar.xz">zig-macos-x86_64-0.7.1.tar.xz</a></td>
+            <td>Binary</td>
+            <td>35MiB</td>
+            <td class="code">845cb17562978af0cf67e3993f4e33330525eaf01ead9386df9105111e3bc519</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.7.1/zig-windows-i386-0.7.1.zip">zig-windows-i386-0.7.1.zip</a></td>
+            <td>Binary</td>
+            <td>52MiB</td>
+            <td class="code">a1b9a7421e13153e07fd2e2c93ff29aad64d83105b8fcdafa633dbe689caf1c0</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.7.1/zig-windows-x86_64-0.7.1.zip">zig-windows-x86_64-0.7.1.zip</a></td>
+            <td>Binary</td>
+            <td>53MiB</td>
+            <td class="code">4818a8a65b4672bc52c0ae7f14d014e0eb8caf10f12c0745176820384cea296a</td>
+          </tr>
+        </tbody>
+      </table>
+
+      <h2 id="release-0.7.0">0.7.0</h2>
+      <ul>
+        <li>2020-11-08</li>
+        <li><a href="0.7.0/release-notes.html">Release Notes</a></li>
+        <li><a href="/documentation/0.7.0/">Language Reference</a></li>
+        <li><a href="/documentation/0.7.0/std/">Standard Library Documentation</a> (experimental)</li>
+      </ul>
+      <table>
+        <colgroup>
+          <col width="40%">
+          <col width="10%">
+          <col width="10%">
+        </colgroup>
+        <thead>
+          <tr>
+            <th>Filename</th>
+            <th>Kind</th>
+            <th>Size</th>
+            <th>Sha256</th>
+          </tr>
+        </thead>
+        <tbody>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.7.0/zig-0.7.0.tar.xz">zig-0.7.0.tar.xz</a></td>
+            <td>Source</td>
+            <td>11MiB</td>
+            <td class="code">0efd2cf6c3b05723db80e9cf193bc55150bba84ca41f855a90f53fc756445f83</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.7.0/zig-bootstrap-0.7.0.tar.xz">zig-bootstrap-0.7.0.tar.xz</a></td>
+            <td>Source</td>
+            <td>39MiB</td>
+            <td class="code">f073beaf5c53c8c57c0d374cbfcb332ef92ad703173edba0d9e0f2ed28401b72</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.7.0/zig-freebsd-x86_64-0.7.0.tar.xz">zig-freebsd-x86_64-0.7.0.tar.xz</a></td>
+            <td>Binary</td>
+            <td>34MiB</td>
+            <td class="code">a0c926272ee4ae720034b4a6a1dc98399d76156dd84182554740f0ca8a41fc99</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.7.0/zig-linux-aarch64-0.7.0.tar.xz">zig-linux-aarch64-0.7.0.tar.xz</a></td>
+            <td>Binary</td>
+            <td>32MiB</td>
+            <td class="code">f89933bac87d44be82325754ff88423020c81c7032a6fc41cfeb81e982eeab9b</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.7.0/zig-linux-armv7a-0.7.0.tar.xz">zig-linux-armv7a-0.7.0.tar.xz</a></td>
+            <td>Binary</td>
+            <td>34MiB</td>
+            <td class="code">011c267e25a96ee160505a560c441daa045359a9d50e13ab1bada9d75c95db2d</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.7.0/zig-linux-i386-0.7.0.tar.xz">zig-linux-i386-0.7.0.tar.xz</a></td>
+            <td>Binary</td>
+            <td>37MiB</td>
+            <td class="code">4bb2072cd363bcb1cbeb4872ff5cbc1f683b02d0cc1f90c46e3ea7422ce53222</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.7.0/zig-linux-riscv64-0.7.0.tar.xz">zig-linux-riscv64-0.7.0.tar.xz</a></td>
+            <td>Binary</td>
+            <td>36MiB</td>
+            <td class="code">40dff81faa6f232ac40abbf88b9371f3cc932b6e09c423b94387c9ea580cb7be</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.7.0/zig-linux-x86_64-0.7.0.tar.xz">zig-linux-x86_64-0.7.0.tar.xz</a></td>
+            <td>Binary</td>
+            <td>36MiB</td>
+            <td class="code">e619b1c6094c095b932767f527aee2507f847ea981513ff8a08aab0fd730e0ac</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.7.0/zig-macos-aarch64-0.7.0.tar.xz">zig-macos-aarch64-0.7.0.tar.xz</a></td>
+            <td>Binary</td>
+            <td>33MiB</td>
+            <td class="code">338238035734db74ea4f30e500a4893bf741d38305c10952d5e39fa05bdb057d</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.7.0/zig-macos-x86_64-0.7.0.tar.xz">zig-macos-x86_64-0.7.0.tar.xz</a></td>
+            <td>Binary</td>
+            <td>35MiB</td>
+            <td class="code">94063f9a311cbbf7a2e0a12295e09437182cf950f18cb0eb30ea9893f3677f24</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.7.0/zig-windows-i386-0.7.0.zip">zig-windows-i386-0.7.0.zip</a></td>
+            <td>Binary</td>
+            <td>51MiB</td>
+            <td class="code">b1e520aacbfbd645ff3521b3eb4d44166d9a0288b8725e4b001f8b50a425eb2e</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.7.0/zig-windows-x86_64-0.7.0.zip">zig-windows-x86_64-0.7.0.zip</a></td>
+            <td>Binary</td>
+            <td>52MiB</td>
+            <td class="code">965f56c0a36f9cda2125e3a348bc654f7f155e2804c3667d231775ec228f8553</td>
+          </tr>
+        </tbody>
+      </table>
+
+      <h2 id="release-0.6.0">0.6.0</h2>
+      <ul>
+        <li>2020-04-13</li>
+        <li><a href="0.6.0/release-notes.html">Release Notes</a></li>
+        <li><a href="/documentation/0.6.0/">Language Reference</a></li>
+        <li><a href="/documentation/0.6.0/std/">Standard Library Documentation</a> (experimental)</li>
+      </ul>
+      <table>
+        <colgroup>
+          <col width="40%">
+          <col width="10%">
+          <col width="10%">
+        </colgroup>
+        <thead>
+          <tr>
+            <th>Filename</th>
+            <th>Kind</th>
+            <th>Size</th>
+            <th>Sha256</th>
+          </tr>
+        </thead>
+        <tbody>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.6.0/zig-0.6.0.tar.xz">zig-0.6.0.tar.xz</a></td>
+            <td>Source</td>
+            <td>9.9MiB</td>
+            <td class="code">5d167dc19354282dd35dd17b38e99e1763713b9be8a4ba9e9e69284e059e7204</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.6.0/zig-bootstrap-0.6.0.tar.xz">zig-bootstrap-0.6.0.tar.xz</a></td>
+            <td>Source</td>
+            <td>36.7MiB</td>
+            <td class="code">5e0e4dc878b3dd0c1852a442b174f0732e8c07869a8fcd226b71a93b89b381ab</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.6.0/zig-freebsd-x86_64-0.6.0.tar.xz">zig-freebsd-x86_64-0.6.0.tar.xz</a></td>
+            <td>Binary</td>
+            <td>36MiB</td>
+            <td class="code">190ff79c1eb56805a315d7c7a51082e32f62926250c0702b36760c225e1634a3</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.6.0/zig-linux-aarch64-0.6.0.tar.xz">zig-linux-aarch64-0.6.0.tar.xz</a></td>
+            <td>Binary</td>
+            <td>36MiB</td>
+            <td class="code">e7520efd42cfa02be48c2e430d08fe1f3cbb999d21d9f0d3ffd0febb976b2f41</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.6.0/zig-linux-armv6kz-0.6.0.tar.xz">zig-linux-armv6kz-0.6.0.tar.xz</a></td>
+            <td>Binary</td>
+            <td>38MiB</td>
+            <td class="code">36b6493b3fed43eb1f0000e765798ad31a6bb7d7fd3f553ac1c3761dbc919b82</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.6.0/zig-linux-armv7a-0.6.0.tar.xz">zig-linux-armv7a-0.6.0.tar.xz</a></td>
+            <td>Binary</td>
+            <td>38MiB</td>
+            <td class="code">946969abe357def95ca9cbbfcebfcf2d90cf967bcd3f48ee87662e32d91d8f35</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.6.0/zig-linux-i386-0.6.0.tar.xz">zig-linux-i386-0.6.0.tar.xz</a></td>
+            <td>Binary</td>
+            <td>43MiB</td>
+            <td class="code">a97a2f9ae21575743cdd763c1917d49400d83fc562ef64582b18bade43eb24ce</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.6.0/zig-linux-riscv64-0.6.0.tar.xz">zig-linux-riscv64-0.6.0.tar.xz</a></td>
+            <td>Binary</td>
+            <td>41MiB</td>
+            <td class="code">68ddee43f7503c8ae5f26a921f3602c34719a02ed2241f528c0b8b888cc14b38</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.6.0/zig-linux-x86_64-0.6.0.tar.xz">zig-linux-x86_64-0.6.0.tar.xz</a></td>
+            <td>Binary</td>
+            <td>43MiB</td>
+            <td class="code">08fd3c757963630645441c2772362e9c2294020c44f14fce1b89f45de0dc1253</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.6.0/zig-macos-x86_64-0.6.0.tar.xz">zig-macos-x86_64-0.6.0.tar.xz</a></td>
+            <td>Binary</td>
+            <td>41MiB</td>
+            <td class="code">17270360e87ddc49f737e760047b2fac49f1570a824a306119b1194ac4093895</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.6.0/zig-windows-i386-0.6.0.zip">zig-windows-i386-0.6.0.zip</a></td>
+            <td>Binary</td>
+            <td>58MiB</td>
+            <td class="code">3b0a02618743e92175990dc6d1a787bb95ff62c4cda016f1c14c7786f575f8ca</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.6.0/zig-windows-x86_64-0.6.0.zip">zig-windows-x86_64-0.6.0.zip</a></td>
+            <td>Binary</td>
+            <td>47MiB</td>
+            <td class="code">c3b897832523e1026e10b2d8d55d7f895185c0a27a63681f3a23219c3f1c38f4</td>
+          </tr>
+        </tbody>
+      </table>
+
+      <h2 id="release-0.5.0">0.5.0</h2>
+      <ul>
+        <li>2019-09-30</li>
+        <li><a href="0.5.0/release-notes.html">Release Notes</a></li>
+        <li><a href="/documentation/0.5.0/">Documentation</a></li>
+      </ul>
+      <table>
+        <colgroup>
+          <col width="40%">
+          <col width="10%">
+          <col width="10%">
+        </colgroup>
+        <thead>
+          <tr>
+            <th>Filename</th>
+            <th>Kind</th>
+            <th>Size</th>
+            <th>Sha256</th>
+          </tr>
+        </thead>
+        <tbody>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.5.0/zig-0.5.0.tar.xz">zig-0.5.0.tar.xz</a></td>
+            <td>Source</td>
+            <td>10.4MiB</td>
+            <td class="code">55ae16960f152bcb9cf98b4f8570902d0e559a141abf927f0d3555b7cc838a31</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.5.0/zig-linux-x86_64-0.5.0.tar.xz">zig-linux-x86_64-0.5.0.tar.xz</a></td>
+            <td>Binary</td>
+            <td>39.0MiB</td>
+            <td class="code">43e8f8a8b8556edd373ddf9c1ef3ca6cf852d4d09fe07d5736d12fefedd2b4f7</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.5.0/zig-windows-x86_64-0.5.0.zip">zig-windows-x86_64-0.5.0.zip</a></td>
+            <td>Binary</td>
+            <td>42.8MiB</td>
+            <td class="code">58141323db8d84a5af62746be5f9140bc161ee760ef33dc91a887bf9ac021976</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.5.0/zig-macos-x86_64-0.5.0.tar.xz">zig-macos-x86_64-0.5.0.tar.xz</a></td>
+            <td>Binary</td>
+            <td>36.1MiB</td>
+            <td class="code">28702cc05745c7c0bd450487d5f4091bf0a1ad279b35eb9a640ce3e3a15b300d</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.5.0/zig-freebsd-x86_64-0.5.0.tar.xz">zig-freebsd-x86_64-0.5.0.tar.xz</a></td>
+            <td>Binary</td>
+            <td>32.1MiB</td>
+            <td class="code">9e1f4d36c3d584c0aa01f20eb4cd0a0eef3eee5af23e483b8414de55feab6ab6</td>
+          </tr>
+        </tbody>
+      </table>
+
+      <h2 id="release-0.4.0">0.4.0</h2>
+      <ul>
+        <li>2019-04-08</li>
+        <li><a href="0.4.0/release-notes.html">Release Notes</a></li>
+        <li><a href="/documentation/0.4.0/">Documentation</a></li>
+      </ul>
+      <table>
+        <colgroup>
+          <col width="40%">
+          <col width="10%">
+          <col width="10%">
+        </colgroup>
+        <thead>
+          <tr>
+            <th>Filename</th>
+            <th>Kind</th>
+            <th>Size</th>
+            <th>Sha256</th>
+          </tr>
+        </thead>
+        <tbody>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.4.0/zig-0.4.0.tar.xz">zig-0.4.0.tar.xz</a></td>
+            <td>Source</td>
+            <td>5.1MiB</td>
+            <td class="code">fec1f3f6b359a3d942e0a7f9157b3b30cde83927627a0e1ea95c54de3c526cfc</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.4.0/zig-linux-x86_64-0.4.0.tar.xz">zig-linux-x86_64-0.4.0.tar.xz</a></td>
+            <td>Binary</td>
+            <td>31.4MiB</td>
+            <td class="code">fb1954e2fb556a01f8079a08130e88f70084e08978ff853bb2b1986d8c39d84e</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.4.0/zig-windows-x86_64-0.4.0.zip">zig-windows-x86_64-0.4.0.zip</a></td>
+            <td>Binary</td>
+            <td>34.1MiB</td>
+            <td class="code">fbc3dd205e064c263063f69f600bedb18e3d0aa2efa747a63ef6cafb6d73f127</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.4.0/zig-macos-x86_64-0.4.0.tar.xz">zig-macos-x86_64-0.4.0.tar.xz</a></td>
+            <td>Binary</td>
+            <td>29.4MiB</td>
+            <td class="code">67c932982484d017c5111e54af9f33f15e8e05c6bc5346a55e04052159c964a8</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.4.0/zig-freebsd-x86_64-0.4.0.tar.xz">zig-freebsd-x86_64-0.4.0.tar.xz</a></td>
+            <td>Binary</td>
+            <td>26.0MiB</td>
+            <td class="code">3d557c91ac36d8262eb1733bb5f261c95944f9b635e43386e3d00a3272818c30</td>
+          </tr>
+        </tbody>
+      </table>
+
+      <h2 id="release-0.3.0">0.3.0</h2>
+      <ul>
+        <li>2018-09-28</li>
+        <li><a href="0.3.0/release-notes.html">Release Notes</a></li>
+        <li><a href="/documentation/0.3.0/">Documentation</a></li>
+      </ul>
+      <table>
+        <colgroup>
+          <col width="40%">
+          <col width="10%">
+          <col width="10%">
+        </colgroup>
+        <thead>
+          <tr>
+            <th>Filename</th>
+            <th>Kind</th>
+            <th>Size</th>
+            <th>Sha256</th>
+          </tr>
+        </thead>
+        <tbody>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.3.0/zig-0.3.0.tar.xz">zig-0.3.0.tar.xz</a></td>
+            <td>Source</td>
+            <td>2.2MiB</td>
+            <td class="code">d70af604f3a8622f3393d93abb3e056bf60351e32d121e6fa4fe03d8d41e1f5a</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.3.0/zig-linux-x86_64-0.3.0.tar.xz">zig-linux-x86_64-0.3.0.tar.xz</a></td>
+            <td>Binary</td>
+            <td>24.0MiB</td>
+            <td class="code">b378d0aae30cb54f28494e7bc4efbc9bfb6326f47bfb302e8b5287af777b2f3c</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.3.0/zig-windows-x86_64-0.3.0.zip">zig-windows-x86_64-0.3.0.zip</a></td>
+            <td>Binary</td>
+            <td>21.5MiB</td>
+            <td class="code">bb568c03950958f8bb3472139c3ab5ed74547c8c694ab50f404c202faf51baf4</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.3.0/zig-macos-x86_64-0.3.0.tar.xz">zig-macos-x86_64-0.3.0.tar.xz</a></td>
+            <td>Binary</td>
+            <td>22.6MiB</td>
+            <td class="code">19dec1f1943ab7be26823376d466f7e456143deb34e17502778a949034dc2e7e</td>
+          </tr>
+        </tbody>
+      </table>
+
+      <h2 id="release-0.2.0">0.2.0</h2>
+      <ul>
+        <li>2018-03-15</li>
+        <li><a href="0.2.0/release-notes.html">Release Notes</a></li>
+        <li><a href="/documentation/0.2.0/">Documentation</a></li>
+      </ul>
+      <table>
+        <colgroup>
+          <col width="40%">
+          <col width="10%">
+          <col width="10%">
+        </colgroup>
+        <thead>
+          <tr>
+            <th>Filename</th>
+            <th>Kind</th>
+            <th>Size</th>
+            <th>Sha256</th>
+          </tr>
+        </thead>
+        <tbody>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.2.0/zig-0.2.0.tar.xz">zig-0.2.0.tar.xz</a></td>
+            <td>Source</td>
+            <td>1.9MiB</td>
+            <td class="code">29c9beb172737f4d5019b88ceae829ae8bc6512fb4386cfbf895ae2b42aa6965</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.2.0/zig-linux-x86_64-0.2.0.tar.xz">zig-linux-x86_64-0.2.0.tar.xz</a></td>
+            <td>Binary</td>
+            <td>23.5MiB</td>
+            <td class="code">209c6fb745d42474c0a73d6f291c7ae3a38b6a1b6b641eea285a7f840cc1a890</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.2.0/zig-win64-0.2.0.zip">zig-win64-0.2.0.zip</a></td>
+            <td>Binary</td>
+            <td>20.6MiB</td>
+            <td class="code">4f8a2979941a1f081ec8e545cca0b72608c0db1c5a3fd377a94db40649dcd3d4</td>
+          </tr>
+        </tbody>
+      </table>
+
+      <h2 id="release-0.1.1">0.1.1</h2>
+      <ul>
+        <li>2017-10-17</li>
+        <li><a href="0.1.1/release-notes.html">Release Notes</a></li>
+        <li><a href="/documentation/0.1.1/">Documentation</a></li>
+      </ul>
+      <table>
+        <colgroup>
+          <col width="40%">
+          <col width="10%">
+          <col width="10%">
+        </colgroup>
+        <thead>
+          <tr>
+            <th>Filename</th>
+            <th>Kind</th>
+            <th>Size</th>
+            <th>Sha256</th>
+          </tr>
+        </thead>
+        <tbody>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.1.1/zig-0.1.1.tar.xz">zig-0.1.1.tar.xz</a></td>
+            <td>Source</td>
+            <td>1.62MiB</td>
+            <td class="code">ffca0cfb263485287e19cc997b08701fcd5f24b700345bcdc3dd8074f5a104e0</td>
+          </tr>
+          <tr>
+            <td><a href="https://ziglang.org/download/0.1.1/zig-win64-0.1.1.zip">zig-win64-0.1.1.zip</a></td>
+            <td>Binary</td>
+            <td>19.3MiB</td>
+            <td class="code">6fc88bef531af7e567fe30bf60da1487b86833cbee84c7a2f3e317030aa5b660</td>
+          </tr>
+        </tbody>
+      </table>
+    </div>
+  </body>
+</html>
--- a/ci/srht/index.json
+++ b/ci/srht/index.json
@@ -0,0 +1,356 @@
+{
+  "master": {
+    "version": "{{MASTER_VERSION}}",
+    "date": "{{MASTER_DATE}}",
+    "docs": "https://ziglang.org/documentation/master/",
+    "stdDocs": "https://ziglang.org/documentation/master/std/",
+    "src": {
+      "tarball": "https://ziglang.org/builds/{{SRC_TARBALL}}",
+      "shasum": "{{SRC_SHASUM}}",
+      "size": "{{SRC_BYTESIZE}}"
+    },
+    "x86_64-freebsd": {
+      "tarball": "https://ziglang.org/builds/{{X86_64_FREEBSD_TARBALL}}",
+      "shasum": "{{X86_64_FREEBSD_SHASUM}}",
+      "size": "{{X86_64_FREEBSD_BYTESIZE}}"
+    },
+    "x86_64-macos": {
+      "tarball": "https://ziglang.org/builds/{{X86_64_MACOS_TARBALL}}",
+      "shasum": "{{X86_64_MACOS_SHASUM}}",
+      "size": "{{X86_64_MACOS_BYTESIZE}}"
+    },
+    "aarch64-macos": {
+      "tarball": "https://ziglang.org/builds/{{AARCH64_MACOS_TARBALL}}",
+      "shasum": "{{AARCH64_MACOS_SHASUM}}",
+      "size": "{{AARCH64_MACOS_BYTESIZE}}"
+    },
+    "x86_64-windows": {
+      "tarball": "https://ziglang.org/builds/{{X86_64_WINDOWS_TARBALL}}",
+      "shasum": "{{X86_64_WINDOWS_SHASUM}}",
+      "size": "{{X86_64_WINDOWS_BYTESIZE}}"
+    },
+    "x86_64-linux": {
+      "tarball": "https://ziglang.org/builds/{{X86_64_LINUX_TARBALL}}",
+      "shasum": "{{X86_64_LINUX_SHASUM}}",
+      "size": "{{X86_64_LINUX_BYTESIZE}}"
+    },
+    "aarch64-linux": {
+      "tarball": "https://ziglang.org/builds/{{AARCH64_LINUX_TARBALL}}",
+      "shasum": "{{AARCH64_LINUX_SHASUM}}",
+      "size": "{{AARCH64_LINUX_BYTESIZE}}"
+    }
+  },
+  "0.7.1": {
+    "date": "2020-12-13",
+    "docs": "https://ziglang.org/documentation/0.7.1/",
+    "stdDocs": "https://ziglang.org/documentation/0.7.1/std/",
+    "notes": "https://ziglang.org/download/0.7.1/release-notes.html",
+    "src": {
+      "tarball": "https://ziglang.org/download/0.7.1/zig-0.7.1.tar.xz",
+      "shasum": "2db3b944ab368d955b48743d9f7c963b8f96de1a441ba5a35e197237cc6dae44",
+      "size": "10711824"
+    },
+    "bootstrap": {
+      "tarball": "https://ziglang.org/download/0.7.1/zig-bootstrap-0.7.1.tar.xz",
+      "shasum": "040f27c1fae4b0cac0a2782aecdb691f6a2f8e89db6a6ed35024c31c304fd9b2",
+      "size": "40232612"
+    },
+    "x86_64-freebsd": {
+      "tarball": "https://ziglang.org/download/0.7.1/zig-freebsd-x86_64-0.7.1.tar.xz",
+      "shasum": "e73c1dca35791a3183fdd5ecde0443ebbe180942efceafe651886034fb8def09",
+      "size": "39066808"
+    },
+    "aarch64-linux": {
+      "tarball": "https://ziglang.org/download/0.7.1/zig-linux-aarch64-0.7.1.tar.xz",
+      "shasum": "48ec90eba407e4587ddef7eecef25fec7e13587eb98e3b83c5f2f5fff2a5cbe7",
+      "size": "33780552"
+    },
+    "armv7a-linux": {
+      "tarball": "https://ziglang.org/download/0.7.1/zig-linux-armv7a-0.7.1.tar.xz",
+      "shasum": "5a0662e07b4c4968665e1f97558f8591f6facec45d2e0ff5715e661743107ceb",
+      "size": "35813504"
+    },
+    "i386-linux": {
+      "tarball": "https://ziglang.org/download/0.7.1/zig-linux-i386-0.7.1.tar.xz",
+      "shasum": "4882e052e5f83690bd0334bb4fc1702b5403cb3a3d2aa63fd7d6043d8afecba3",
+      "size": "39230912"
+    },
+    "riscv64-linux": {
+      "tarball": "https://ziglang.org/download/0.7.1/zig-linux-riscv64-0.7.1.tar.xz",
+      "shasum": "187294bfd35983348c3fe042901b42e67e7e36ab7f77a5f969d21c0051f4d21f",
+      "size": "37454812"
+    },
+    "x86_64-linux": {
+      "tarball": "https://ziglang.org/download/0.7.1/zig-linux-x86_64-0.7.1.tar.xz",
+      "shasum": "18c7b9b200600f8bcde1cd8d7f1f578cbc3676241ce36d771937ce19a8159b8d",
+      "size": "37848176"
+    },
+    "x86_64-macos": {
+      "tarball": "https://ziglang.org/download/0.7.1/zig-macos-x86_64-0.7.1.tar.xz",
+      "shasum": "845cb17562978af0cf67e3993f4e33330525eaf01ead9386df9105111e3bc519",
+      "size": "36211076"
+    },
+    "i386-windows": {
+      "tarball": "https://ziglang.org/download/0.7.1/zig-windows-i386-0.7.1.zip",
+      "shasum": "a1b9a7421e13153e07fd2e2c93ff29aad64d83105b8fcdafa633dbe689caf1c0",
+      "size": "54374983"
+    },
+    "x86_64-windows": {
+      "tarball": "https://ziglang.org/download/0.7.1/zig-windows-x86_64-0.7.1.zip",
+      "shasum": "4818a8a65b4672bc52c0ae7f14d014e0eb8caf10f12c0745176820384cea296a",
+      "size": "54909997"
+    }
+  },
+  "0.7.0": {
+    "date": "2020-11-08",
+    "docs": "https://ziglang.org/documentation/0.7.0/",
+    "stdDocs": "https://ziglang.org/documentation/0.7.0/std/",
+    "notes": "https://ziglang.org/download/0.7.0/release-notes.html",
+    "src": {
+      "tarball": "https://ziglang.org/download/0.7.0/zig-0.7.0.tar.xz",
+      "shasum": "0efd2cf6c3b05723db80e9cf193bc55150bba84ca41f855a90f53fc756445f83",
+      "size": "10683920"
+    },
+    "bootstrap": {
+      "tarball": "https://ziglang.org/download/0.7.0/zig-bootstrap-0.7.0.tar.xz",
+      "shasum": "f073beaf5c53c8c57c0d374cbfcb332ef92ad703173edba0d9e0f2ed28401b72",
+      "size": "40200436"
+    },
+    "x86_64-freebsd": {
+      "tarball": "https://ziglang.org/download/0.7.0/zig-freebsd-x86_64-0.7.0.tar.xz",
+      "shasum": "a0c926272ee4ae720034b4a6a1dc98399d76156dd84182554740f0ca8a41fc99",
+      "size": "34798992"
+    },
+    "aarch64-linux": {
+      "tarball": "https://ziglang.org/download/0.7.0/zig-linux-aarch64-0.7.0.tar.xz",
+      "shasum": "f89933bac87d44be82325754ff88423020c81c7032a6fc41cfeb81e982eeab9b",
+      "size": "33096140"
+    },
+    "armv7a-linux": {
+      "tarball": "https://ziglang.org/download/0.7.0/zig-linux-armv7a-0.7.0.tar.xz",
+      "shasum": "011c267e25a96ee160505a560c441daa045359a9d50e13ab1bada9d75c95db2d",
+      "size": "35157584"
+    },
+    "i386-linux": {
+      "tarball": "https://ziglang.org/download/0.7.0/zig-linux-i386-0.7.0.tar.xz",
+      "shasum": "4bb2072cd363bcb1cbeb4872ff5cbc1f683b02d0cc1f90c46e3ea7422ce53222",
+      "size": "38530596"
+    },
+    "riscv64-linux": {
+      "tarball": "https://ziglang.org/download/0.7.0/zig-linux-riscv64-0.7.0.tar.xz",
+      "shasum": "40dff81faa6f232ac40abbf88b9371f3cc932b6e09c423b94387c9ea580cb7be",
+      "size": "36759992"
+    },
+    "x86_64-linux": {
+      "tarball": "https://ziglang.org/download/0.7.0/zig-linux-x86_64-0.7.0.tar.xz",
+      "shasum": "e619b1c6094c095b932767f527aee2507f847ea981513ff8a08aab0fd730e0ac",
+      "size": "37154432"
+    },
+    "aarch64-macos": {
+      "tarball": "https://ziglang.org/download/0.7.0/zig-macos-aarch64-0.7.0.tar.xz",
+      "shasum": "338238035734db74ea4f30e500a4893bf741d38305c10952d5e39fa05bdb057d",
+      "size": "33739424"
+    },
+    "x86_64-macos": {
+      "tarball": "https://ziglang.org/download/0.7.0/zig-macos-x86_64-0.7.0.tar.xz",
+      "shasum": "94063f9a311cbbf7a2e0a12295e09437182cf950f18cb0eb30ea9893f3677f24",
+      "size": "35258328"
+    },
+    "i386-windows": {
+      "tarball": "https://ziglang.org/download/0.7.0/zig-windows-i386-0.7.0.zip",
+      "shasum": "b1e520aacbfbd645ff3521b3eb4d44166d9a0288b8725e4b001f8b50a425eb2e",
+      "size": "53390517"
+    },
+    "x86_64-windows": {
+      "tarball": "https://ziglang.org/download/0.7.0/zig-windows-x86_64-0.7.0.zip",
+      "shasum": "965f56c0a36f9cda2125e3a348bc654f7f155e2804c3667d231775ec228f8553",
+      "size": "53943784"
+    }
+  },
+  "0.6.0": {
+    "date": "2020-04-13",
+    "docs": "https://ziglang.org/documentation/0.6.0/",
+    "stdDocs": "https://ziglang.org/documentation/0.6.0/std/",
+    "notes": "https://ziglang.org/download/0.6.0/release-notes.html",
+    "src": {
+      "tarball": "https://ziglang.org/download/0.6.0/zig-0.6.0.tar.xz",
+      "shasum": "5d167dc19354282dd35dd17b38e99e1763713b9be8a4ba9e9e69284e059e7204",
+      "size": "10349552"
+    },
+    "bootstrap": {
+      "tarball": "https://ziglang.org/download/0.6.0/zig-bootstrap-0.6.0.tar.xz",
+      "shasum": "5e0e4dc878b3dd0c1852a442b174f0732e8c07869a8fcd226b71a93b89b381ab",
+      "size": "38469948"
+    },
+    "x86_64-freebsd": {
+      "tarball": "https://ziglang.org/download/0.6.0/zig-freebsd-x86_64-0.6.0.tar.xz",
+      "shasum": "190ff79c1eb56805a315d7c7a51082e32f62926250c0702b36760c225e1634a3",
+      "size": "36974604"
+    },
+    "aarch64-linux": {
+      "tarball": "https://ziglang.org/download/0.6.0/zig-linux-aarch64-0.6.0.tar.xz",
+      "shasum": "e7520efd42cfa02be48c2e430d08fe1f3cbb999d21d9f0d3ffd0febb976b2f41",
+      "size": "37090044"
+    },
+    "armv6kz-linux": {
+      "tarball": "https://ziglang.org/download/0.6.0/zig-linux-armv6kz-0.6.0.tar.xz",
+      "shasum": "36b6493b3fed43eb1f0000e765798ad31a6bb7d7fd3f553ac1c3761dbc919b82",
+      "size": "39133452"
+    },
+    "armv7a-linux": {
+      "tarball": "https://ziglang.org/download/0.6.0/zig-linux-armv7a-0.6.0.tar.xz",
+      "shasum": "946969abe357def95ca9cbbfcebfcf2d90cf967bcd3f48ee87662e32d91d8f35",
+      "size": "39143748"
+    },
+    "i386-linux": {
+      "tarball": "https://ziglang.org/download/0.6.0/zig-linux-i386-0.6.0.tar.xz",
+      "shasum": "a97a2f9ae21575743cdd763c1917d49400d83fc562ef64582b18bade43eb24ce",
+      "size": "44877640"
+    },
+    "riscv64-linux": {
+      "tarball": "https://ziglang.org/download/0.6.0/zig-linux-riscv64-0.6.0.tar.xz",
+      "shasum": "68ddee43f7503c8ae5f26a921f3602c34719a02ed2241f528c0b8b888cc14b38",
+      "size": "41993144"
+    },
+    "x86_64-linux": {
+      "tarball": "https://ziglang.org/download/0.6.0/zig-linux-x86_64-0.6.0.tar.xz",
+      "shasum": "08fd3c757963630645441c2772362e9c2294020c44f14fce1b89f45de0dc1253",
+      "size": "44766320"
+    },
+    "x86_64-macos": {
+      "tarball": "https://ziglang.org/download/0.6.0/zig-macos-x86_64-0.6.0.tar.xz",
+      "shasum": "17270360e87ddc49f737e760047b2fac49f1570a824a306119b1194ac4093895",
+      "size": "42573184"
+    },
+    "i386-windows": {
+      "tarball": "https://ziglang.org/download/0.6.0/zig-windows-i386-0.6.0.zip",
+      "shasum": "3b0a02618743e92175990dc6d1a787bb95ff62c4cda016f1c14c7786f575f8ca",
+      "size": "60446431"
+    },
+    "x86_64-windows": {
+      "tarball": "https://ziglang.org/download/0.6.0/zig-windows-x86_64-0.6.0.zip",
+      "shasum": "c3b897832523e1026e10b2d8d55d7f895185c0a27a63681f3a23219c3f1c38f4",
+      "size": "49065511"
+    }
+  },
+  "0.5.0": {
+    "date": "2019-09-30",
+    "docs": "https://ziglang.org/documentation/0.5.0/",
+    "notes": "https://ziglang.org/download/0.5.0/release-notes.html",
+    "src": {
+      "tarball": "https://ziglang.org/download/0.5.0/zig-0.5.0.tar.xz",
+      "shasum": "55ae16960f152bcb9cf98b4f8570902d0e559a141abf927f0d3555b7cc838a31",
+      "size": "10956132"
+    },
+    "x86_64-freebsd": {
+      "tarball": "https://ziglang.org/download/0.5.0/zig-freebsd-x86_64-0.5.0.tar.xz",
+      "shasum": "9e1f4d36c3d584c0aa01f20eb4cd0a0eef3eee5af23e483b8414de55feab6ab6",
+      "size": "33650744"
+    },
+    "x86_64-macos": {
+      "tarball": "https://ziglang.org/download/0.5.0/zig-macos-x86_64-0.5.0.tar.xz",
+      "shasum": "28702cc05745c7c0bd450487d5f4091bf0a1ad279b35eb9a640ce3e3a15b300d",
+      "size": "37898664"
+    },
+    "x86_64-windows": {
+      "tarball": "https://ziglang.org/download/0.5.0/zig-windows-x86_64-0.5.0.zip",
+      "shasum": "58141323db8d84a5af62746be5f9140bc161ee760ef33dc91a887bf9ac021976",
+      "size": "44871804"
+    },
+    "x86_64-linux": {
+      "tarball": "https://ziglang.org/download/0.5.0/zig-linux-x86_64-0.5.0.tar.xz",
+      "shasum": "43e8f8a8b8556edd373ddf9c1ef3ca6cf852d4d09fe07d5736d12fefedd2b4f7",
+      "size": "40895068"
+    }
+  },
+  "0.4.0": {
+    "date": "2019-04-08",
+    "docs": "https://ziglang.org/documentation/0.4.0/",
+    "notes": "https://ziglang.org/download/0.4.0/release-notes.html",
+    "src": {
+      "tarball": "https://ziglang.org/download/0.4.0/zig-0.4.0.tar.xz",
+      "shasum": "fec1f3f6b359a3d942e0a7f9157b3b30cde83927627a0e1ea95c54de3c526cfc",
+      "size": "5348776"
+    },
+    "x86_64-freebsd": {
+      "tarball": "https://ziglang.org/download/0.4.0/zig-freebsd-x86_64-0.4.0.tar.xz",
+      "shasum": "3d557c91ac36d8262eb1733bb5f261c95944f9b635e43386e3d00a3272818c30",
+      "size": "27269672"
+    },
+    "x86_64-macos": {
+      "tarball": "https://ziglang.org/download/0.4.0/zig-macos-x86_64-0.4.0.tar.xz",
+      "shasum": "67c932982484d017c5111e54af9f33f15e8e05c6bc5346a55e04052159c964a8",
+      "size": "30841504"
+    },
+    "x86_64-windows": {
+      "tarball": "https://ziglang.org/download/0.4.0/zig-windows-x86_64-0.4.0.zip",
+      "shasum": "fbc3dd205e064c263063f69f600bedb18e3d0aa2efa747a63ef6cafb6d73f127",
+      "size": "35800101"
+    },
+    "x86_64-linux": {
+      "tarball": "https://ziglang.org/download/0.4.0/zig-linux-x86_64-0.4.0.tar.xz",
+      "shasum": "fb1954e2fb556a01f8079a08130e88f70084e08978ff853bb2b1986d8c39d84e",
+      "size": "32876100"
+    }
+  },
+  "0.3.0": {
+    "date": "2018-09-28",
+    "docs": "https://ziglang.org/documentation/0.3.0/",
+    "notes": "https://ziglang.org/download/0.3.0/release-notes.html",
+    "src": {
+      "tarball": "https://ziglang.org/download/0.3.0/zig-0.3.0.tar.xz",
+      "shasum": "d70af604f3a8622f3393d93abb3e056bf60351e32d121e6fa4fe03d8d41e1f5a",
+      "size": "2335592"
+    },
+    "x86_64-macos": {
+      "tarball": "https://ziglang.org/download/0.3.0/zig-macos-x86_64-0.3.0.tar.xz",
+      "shasum": "19dec1f1943ab7be26823376d466f7e456143deb34e17502778a949034dc2e7e",
+      "size": "23712696"
+    },
+    "x86_64-windows": {
+      "tarball": "https://ziglang.org/download/0.3.0/zig-windows-x86_64-0.3.0.zip",
+      "shasum": "bb568c03950958f8bb3472139c3ab5ed74547c8c694ab50f404c202faf51baf4",
+      "size": "22524425"
+    },
+    "x86_64-linux": {
+      "tarball": "https://ziglang.org/download/0.3.0/zig-linux-x86_64-0.3.0.tar.xz",
+      "shasum": "b378d0aae30cb54f28494e7bc4efbc9bfb6326f47bfb302e8b5287af777b2f3c",
+      "size": "25209304"
+    }
+  },
+  "0.2.0": {
+    "date": "2018-03-15",
+    "docs": "https://ziglang.org/documentation/0.2.0/",
+    "notes": "https://ziglang.org/download/0.2.0/release-notes.html",
+    "src": {
+      "tarball": "https://ziglang.org/download/0.2.0/zig-0.2.0.tar.xz",
+      "shasum": "29c9beb172737f4d5019b88ceae829ae8bc6512fb4386cfbf895ae2b42aa6965",
+      "size": "1940832"
+    },
+    "x86_64-windows": {
+      "tarball": "https://ziglang.org/download/0.2.0/zig-win64-0.2.0.zip",
+      "shasum": "4f8a2979941a1f081ec8e545cca0b72608c0db1c5a3fd377a94db40649dcd3d4",
+      "size": "21076274"
+    },
+    "x86_64-linux": {
+      "tarball": "https://ziglang.org/download/0.2.0/zig-linux-x86_64-0.2.0.tar.xz",
+      "shasum": "209c6fb745d42474c0a73d6f291c7ae3a38b6a1b6b641eea285a7f840cc1a890",
+      "size": "22551928"
+    }
+  },
+  "0.1.1": {
+    "date": "2017-10-17",
+    "docs": "https://ziglang.org/documentation/0.1.1/",
+    "notes": "https://ziglang.org/download/0.1.1/release-notes.html",
+    "src": {
+      "tarball": "https://ziglang.org/download/0.1.1/zig-0.1.1.tar.xz",
+      "shasum": "ffca0cfb263485287e19cc997b08701fcd5f24b700345bcdc3dd8074f5a104e0",
+      "size": "1659716"
+    },
+    "x86_64-windows": {
+      "tarball": "https://ziglang.org/download/0.1.1/zig-win64-0.1.1.zip",
+      "shasum": "6fc88bef531af7e567fe30bf60da1487b86833cbee84c7a2f3e317030aa5b660",
+      "size": "19757776"
+    }
+  }
+}
--- a/ci/srht/on_master_success
+++ b/ci/srht/on_master_success
@@ -24,6 +24,7 @@ packages:
  - xz
 secrets:
  - 51bfddf5-86a6-4e01-8576-358c72a4a0a4
+  - 44e2bd57-1d07-42bf-925e-22a36119041d
 sources:
  - https://github.com/ziglang/zig
 tasks:
--- a/ci/srht/update-download-page.zig
+++ b/ci/srht/update-download-page.zig
@@ -0,0 +1,135 @@
+const std = @import("std");
+const path = std.fs.path;
+const mem = std.mem;
+
+/// Renders `index.html` and `index.json` from the current working directory
+/// into the `out` directory, substituting `{{NAME}}` placeholders with the
+/// values of the matching environment variables (see `render`).
+pub fn main() !void {
+    // Everything is allocated from one arena that is released on exit, so
+    // `render` does not need to free its individual allocations.
+    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
+    defer arena.deinit();
+
+    const allocator = &arena.allocator;
+
+    const out_dir = "out";
+    try std.fs.cwd().makePath(out_dir);
+    {
+        const out_file = out_dir ++ path.sep_str ++ "index.html";
+        const in_file = "index.html";
+        try render(allocator, in_file, out_file, .html);
+    }
+    {
+        const out_file = out_dir ++ path.sep_str ++ "index.json";
+        const in_file = "index.json";
+        try render(allocator, in_file, out_file, .plain);
+    }
+}
+
+/// Expands the template `in_file` and writes the result to `out_file`; both
+/// paths are resolved against the current working directory. The template is
+/// read with a 1 MiB size limit.
+///
+/// A `{{NAME}}` placeholder is replaced by the value of the environment
+/// variable `NAME` with leading and trailing spaces, carriage returns and
+/// newlines trimmed. When `fmt` is `.html` and `NAME` ends in `BYTESIZE`, the
+/// value is parsed as a decimal integer and printed through
+/// `std.fmt.fmtIntSizeDec` instead. An unset variable is reported with
+/// `std.debug.warn` and rendered as `(missing)`. A `{` that is not followed by
+/// another `{` is copied through unchanged.
+///
+/// A placeholder closed by a single `}`, or one still open at the end of the
+/// input, prints a diagnostic and exits the process with status 1.
+fn render(
+    allocator: *mem.Allocator,
+    in_file: []const u8,
+    out_file: []const u8,
+    fmt: enum {
+        html,
+        plain,
+    },
+) !void {
+    const in_contents = try std.fs.cwd().readFileAlloc(allocator, in_file, 1 * 1024 * 1024);
+
+    var vars = try std.process.getEnvMap(allocator);
+
+    var buffer = std.ArrayList(u8).init(allocator);
+    defer buffer.deinit();
+
+    // Scanner states: outside a placeholder, after one `{`, inside the
+    // variable name, and after the first closing `}`.
+    const State = enum {
+        Start,
+        OpenBrace,
+        VarName,
+        EndBrace,
+    };
+    const writer = buffer.writer();
+    var state = State.Start;
+    var var_name_start: usize = undefined;
+    var line: usize = 1;
+    for (in_contents) |byte, index| {
+        switch (state) {
+            State.Start => switch (byte) {
+                '{' => {
+                    state = State.OpenBrace;
+                },
+                else => try writer.writeByte(byte),
+            },
+            State.OpenBrace => switch (byte) {
+                '{' => {
+                    state = State.VarName;
+                    var_name_start = index + 1;
+                },
+                else => {
+                    try writer.writeByte('{');
+                    try writer.writeByte(byte);
+                    state = State.Start;
+                },
+            },
+            State.VarName => switch (byte) {
+                '}' => {
+                    const var_name = in_contents[var_name_start..index];
+                    if (vars.get(var_name)) |value| {
+                        const trimmed = mem.trim(u8, value, " \r\n");
+                        if (fmt == .html and mem.endsWith(u8, var_name, "BYTESIZE")) {
+                            const size = try std.fmt.parseInt(u64, trimmed, 10);
+                            try writer.print("{:.1}", .{std.fmt.fmtIntSizeDec(size)});
+                        } else {
+                            try writer.writeAll(trimmed);
+                        }
+                    } else {
+                        std.debug.warn("line {d}: missing variable: {s}\n", .{ line, var_name });
+                        try writer.writeAll("(missing)");
+                    }
+                    state = State.EndBrace;
+                },
+                else => {},
+            },
+            State.EndBrace => switch (byte) {
+                '}' => {
+                    state = State.Start;
+                },
+                else => {
+                    std.debug.warn("line {d}: invalid byte: '0x{x}'\n", .{ line, byte });
+                    std.process.exit(1);
+                },
+            },
+        }
+        if (byte == '\n') {
+            line += 1;
+        }
+    }
+    // Do not silently drop input that ends in the middle of a token.
+    switch (state) {
+        State.Start => {},
+        // A lone trailing `{` is literal text that has not been emitted yet.
+        State.OpenBrace => try writer.writeByte('{'),
+        State.VarName, State.EndBrace => {
+            std.debug.warn("{s}: unterminated variable reference at end of input\n", .{in_file});
+            std.process.exit(1);
+        },
+    }
+    try std.fs.cwd().writeFile(out_file, buffer.items);
+}
--- a/ci/srht/update_download_page
+++ b/ci/srht/update_download_page
@@ -12,6 +12,7 @@ NATIVE_TARBALL="zig-linux-$(uname -m)-$VERSION.tar.xz"
 AARCH64_LINUX_JSON_URL="https://ziglang.org/builds/aarch64-linux-$VERSION.json"
 X86_64_LINUX_JSON_URL="https://ziglang.org/builds/x86_64-linux-$VERSION.json"
 X86_64_WINDOWS_JSON_URL="https://ziglang.org/builds/x86_64-windows-$VERSION.json"
+AARCH64_MACOS_JSON_URL="https://ziglang.org/builds/aarch64-macos-$VERSION.json"
 X86_64_MACOS_JSON_URL="https://ziglang.org/builds/x86_64-macos-$VERSION.json"
 X86_64_FREEBSD_JSON_URL="https://ziglang.org/builds/x86_64-freebsd-$VERSION.json"

@@ -20,6 +21,7 @@ X86_64_FREEBSD_JSON_URL="https://ziglang.org/builds/x86_64-freebsd-$VERSION.json
 curl --fail -I "$AARCH64_LINUX_JSON_URL" >/dev/null || exit 0
 curl --fail -I "$X86_64_LINUX_JSON_URL" >/dev/null || exit 0
 curl --fail -I "$X86_64_WINDOWS_JSON_URL" >/dev/null || exit 0
+curl --fail -I "$AARCH64_MACOS_JSON_URL" >/dev/null || exit 0
 curl --fail -I "$X86_64_MACOS_JSON_URL" >/dev/null || exit 0
 curl --fail -I "$X86_64_FREEBSD_JSON_URL" >/dev/null || exit 0

@@ -28,14 +30,23 @@ curl --fail -I "$X86_64_FREEBSD_JSON_URL" >/dev/null || exit 0
 pip3 install s3cmd --user
 S3CMD="$HOME/.local/bin/s3cmd"

+# This is the user when pushing to the website repo.
+git config --global user.email "ziggy@ziglang.org"
+git config --global user.name "Ziggy"
+
+# Refresh this with `ssh-keyscan github.com` from a trusted Internet connection.
+# We hard code the public key here to detect man-in-the-middle attacks.
+echo "github.com ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEAq2A7hRGmdnm9tUDbO9IDSwBK6TbQa+PXYPCPy6rbTrTtw7PHkccKrpp0yVhp5HdEIcKr6pLlVDBfOLX9QUsyCOV0wzfjIJNlGEYsdlLJizHhbn2mUjvSAHQqZETYP81eFzLQNnPHt4EVVUh7VfDESU84KezmD5QlWpXLmvU31/yMf+Se8xhHTvKSCZIFImWwoG6mbUoWf9nzpIoaSjB+weqqUUmpaaasXVal72J+UX2B+2RPW3RcT0eOzQgqlJL3RKrTJvdsjE3JEAvGq3lGHSZXy28G3skua2SmVi/w4yCE6gbODqnTWlg7+wC604ydGXA8VJiS5ap43JXiUFFAaQ==" >> ~/.ssh/known_hosts
+
+# We don't want the .git folder inside the tarball.
 rm -rf .git

 cd "$HOME"
 wget "https://ziglang.org/builds/$NATIVE_TARBALL"
 tar xf "$NATIVE_TARBALL"
-ZIGDIR=$(basename $NATIVE_TARBALL .tar.xz)
+ZIGDIR="$(pwd)/$(basename $NATIVE_TARBALL .tar.xz)"
 ZIG="$ZIGDIR/zig"
-LANGREF="$ZIGDIR/langref.html"
+LANGREF="$ZIGDIR/docs/langref.html"
 SRCTARBALLDIR="zig-$VERSION"
 export SRC_TARBALL="$SRCTARBALLDIR.tar.xz"
 mv "$SRCDIR" "$SRCTARBALLDIR"
@@ -48,6 +59,11 @@ export X86_64_WINDOWS_TARBALL="$(echo "$X86_64_WINDOWS_JSON" | jq .tarball -r)"
 export X86_64_WINDOWS_BYTESIZE="$(echo "$X86_64_WINDOWS_JSON" | jq .size -r)"
 export X86_64_WINDOWS_SHASUM="$(echo "$X86_64_WINDOWS_JSON" | jq .shasum -r)"

+AARCH64_MACOS_JSON=$(curl --fail "$AARCH64_MACOS_JSON_URL" || exit 1)
+export AARCH64_MACOS_TARBALL="$(echo "$AARCH64_MACOS_JSON" | jq .tarball -r)"
+export AARCH64_MACOS_BYTESIZE="$(echo "$AARCH64_MACOS_JSON" | jq .size -r)"
+export AARCH64_MACOS_SHASUM="$(echo "$AARCH64_MACOS_JSON" | jq .shasum -r)"
+
 X86_64_MACOS_JSON=$(curl --fail "$X86_64_MACOS_JSON_URL" || exit 1)
 export X86_64_MACOS_TARBALL="$(echo "$X86_64_MACOS_JSON" | jq .tarball -r)"
 export X86_64_MACOS_BYTESIZE="$(echo "$X86_64_MACOS_JSON" | jq .size -r)"
@@ -68,13 +84,29 @@ export X86_64_FREEBSD_TARBALL="$(echo "$X86_64_FREEBSD_JSON" | jq .tarball -r)"
 export X86_64_FREEBSD_BYTESIZE="$(echo "$X86_64_FREEBSD_JSON" | jq .size -r)"
 export X86_64_FREEBSD_SHASUM="$(echo "$X86_64_FREEBSD_JSON" | jq .shasum -r)"

-git clone https://github.com/ziglang/www.ziglang.org --depth 1
-cd www.ziglang.org
 export MASTER_DATE="$(date +%Y-%m-%d)"
 export MASTER_VERSION="$VERSION"
-"../$ZIG" run update-download-page.zig

-$S3CMD put -P --no-mime-magic --add-header="cache-control: public, max-age=31536000, immutable" "../$SRC_TARBALL" s3://ziglang.org/builds/
-$S3CMD put -P --no-mime-magic "../$LANGREF" s3://ziglang.org/documentation/master/index.html --add-header="Cache-Control: max-age=0, must-revalidate"
-$S3CMD put -P --no-mime-magic www/download/index.html s3://ziglang.org/download/index.html --add-header="Cache-Control: max-age=0, must-revalidate"
-$S3CMD put -P --no-mime-magic www/download/index.json s3://ziglang.org/download/index.json --add-header="Cache-Control: max-age=0, must-revalidate"
+cd "$SRCTARBALLDIR/ci/srht"
+"$ZIG" run update-download-page.zig
+CIDIR="$(pwd)"
+
+# Clone the website repo; the rendered index.json and the docs are committed to it below.
+cd "$HOME"
+git clone git@github.com:ziglang/www.ziglang.org.git
+cd www.ziglang.org
+WWWDIR="$(pwd)"
+
+$S3CMD put -P --no-mime-magic --add-header="cache-control: public, max-age=31536000, immutable" "$HOME/$SRC_TARBALL" s3://ziglang.org/builds/
+
+cd "$WWWDIR"
+cp "$CIDIR/out/index.json" data/releases.json
+mkdir -p content/documentation/master/std
+cp "$LANGREF" content/documentation/master/index.html
+cp "$ZIGDIR/docs/std/index.html" content/documentation/master/std/index.html
+cp "$ZIGDIR/docs/std/data.js" content/documentation/master/std/data.js
+cp "$ZIGDIR/docs/std/main.js" content/documentation/master/std/main.js
+git add data/releases.json
+git add content/
+git commit -m "CI: update releases and docs"
+git push origin master
--- a/cmake/Findclang.cmake
+++ b/cmake/Findclang.cmake
@@ -9,27 +9,27 @@

 find_path(CLANG_INCLUDE_DIRS NAMES clang/Frontend/ASTUnit.h
  PATHS
-    /usr/lib/llvm/11/include
-    /usr/lib/llvm-11/include
-    /usr/lib/llvm-11.0/include
-    /usr/local/llvm110/include
-    /usr/local/llvm11/include
+    /usr/lib/llvm/12/include
+    /usr/lib/llvm-12/include
+    /usr/lib/llvm-12.0/include
+    /usr/local/llvm120/include
+    /usr/local/llvm12/include
    /mingw64/include
 )

 if(ZIG_PREFER_CLANG_CPP_DYLIB)
  find_library(CLANG_LIBRARIES
    NAMES
-      clang-cpp-11.0
-      clang-cpp110
+      clang-cpp-12.0
+      clang-cpp120
      clang-cpp
    PATHS
      ${CLANG_LIBDIRS}
-      /usr/lib/llvm/11/lib
-      /usr/lib/llvm/11/lib64
-      /usr/lib/llvm-11/lib
-      /usr/local/llvm110/lib
-      /usr/local/llvm11/lib
+      /usr/lib/llvm/12/lib
+      /usr/lib/llvm/12/lib64
+      /usr/lib/llvm-12/lib
+      /usr/local/llvm120/lib
+      /usr/local/llvm12/lib
  )
 endif()

@@ -39,11 +39,11 @@ if(NOT CLANG_LIBRARIES)
    find_library(CLANG_${_prettylibname_}_LIB NAMES ${_libname_}
      PATHS
        ${CLANG_LIBDIRS}
-        /usr/lib/llvm/11/lib
-        /usr/lib/llvm-11/lib
-        /usr/lib/llvm-11.0/lib
-        /usr/local/llvm110/lib
-        /usr/local/llvm11/lib
+        /usr/lib/llvm/12/lib
+        /usr/lib/llvm-12/lib
+        /usr/lib/llvm-12.0/lib
+        /usr/local/llvm120/lib
+        /usr/local/llvm12/lib
        /mingw64/lib
        /c/msys64/mingw64/lib
        c:\\msys64\\mingw64\\lib
--- a/cmake/Findlld.cmake
+++ b/cmake/Findlld.cmake
@@ -8,16 +8,16 @@

 find_path(LLD_INCLUDE_DIRS NAMES lld/Common/Driver.h
    PATHS
-        /usr/lib/llvm-11/include
-        /usr/local/llvm110/include
-        /usr/local/llvm11/include
+        /usr/lib/llvm-12/include
+        /usr/local/llvm120/include
+        /usr/local/llvm12/include
        /mingw64/include)

-find_library(LLD_LIBRARY NAMES lld-11.0 lld110 lld
+find_library(LLD_LIBRARY NAMES lld-12.0 lld120 lld
    PATHS
-        /usr/lib/llvm-11/lib
-        /usr/local/llvm110/lib
-        /usr/local/llvm11/lib
+        /usr/lib/llvm-12/lib
+        /usr/local/llvm120/lib
+        /usr/local/llvm12/lib
 )
 if(EXISTS ${LLD_LIBRARY})
    set(LLD_LIBRARIES ${LLD_LIBRARY})
@@ -27,9 +27,9 @@ else()
        find_library(LLD_${_prettylibname_}_LIB NAMES ${_libname_}
            PATHS
                ${LLD_LIBDIRS}
-                /usr/lib/llvm-11/lib
-                /usr/local/llvm110/lib
-                /usr/local/llvm11/lib
+                /usr/lib/llvm-12/lib
+                /usr/local/llvm120/lib
+                /usr/local/llvm12/lib
                /mingw64/lib
                /c/msys64/mingw64/lib
                c:/msys64/mingw64/lib)
--- a/cmake/Findllvm.cmake
+++ b/cmake/Findllvm.cmake
@@ -9,37 +9,37 @@

 find_path(LLVM_INCLUDE_DIRS NAMES llvm/IR/IRBuilder.h
  PATHS
-    /usr/lib/llvm/11/include
-    /usr/lib/llvm-11/include
-    /usr/lib/llvm-11.0/include
-    /usr/local/llvm11/include
-    /usr/local/llvm110/include
+    /usr/lib/llvm/12/include
+    /usr/lib/llvm-12/include
+    /usr/lib/llvm-12.0/include
+    /usr/local/llvm12/include
+    /usr/local/llvm120/include
    /mingw64/include
 )

 if(ZIG_PREFER_CLANG_CPP_DYLIB)
  find_library(LLVM_LIBRARIES
    NAMES
-      LLVM-11.0
-      LLVM-11
-      LLVM-110
+      LLVM-12.0
+      LLVM-12
+      LLVM-120
      LLVM
    PATHS
      ${LLVM_LIBDIRS}
-      /usr/lib/llvm/11/lib
-      /usr/lib/llvm/11/lib64
-      /usr/lib/llvm-11/lib
-      /usr/local/llvm11/lib
-      /usr/local/llvm110/lib
+      /usr/lib/llvm/12/lib
+      /usr/lib/llvm/12/lib64
+      /usr/lib/llvm-12/lib
+      /usr/local/llvm12/lib
+      /usr/local/llvm120/lib
  )

  find_program(LLVM_CONFIG_EXE
-      NAMES llvm-config-11 llvm-config-11.0 llvm-config110 llvm-config11 llvm-config
+      NAMES llvm-config-12 llvm-config-12.0 llvm-config120 llvm-config12 llvm-config
      PATHS
          "/mingw64/bin"
          "/c/msys64/mingw64/bin"
          "c:/msys64/mingw64/bin"
-          "C:/Libraries/llvm-11.0.0/bin")
+          "C:/Libraries/llvm-12.0.0/bin")

  if ("${LLVM_CONFIG_EXE}" STREQUAL "LLVM_CONFIG_EXE-NOTFOUND")
    message(FATAL_ERROR "unable to find llvm-config")
@@ -54,23 +54,23 @@ if(ZIG_PREFER_CLANG_CPP_DYLIB)
    OUTPUT_VARIABLE LLVM_CONFIG_VERSION
    OUTPUT_STRIP_TRAILING_WHITESPACE)

-  if("${LLVM_CONFIG_VERSION}" VERSION_LESS 11)
-    message(FATAL_ERROR "expected LLVM 11.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
+  if("${LLVM_CONFIG_VERSION}" VERSION_LESS 12)
+    message(FATAL_ERROR "expected LLVM 12.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
  endif()
-  if("${LLVM_CONFIG_VERSION}" VERSION_EQUAL 12)
-    message(FATAL_ERROR "expected LLVM 11.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
+  if("${LLVM_CONFIG_VERSION}" VERSION_EQUAL 13)
+    message(FATAL_ERROR "expected LLVM 12.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
  endif()
-  if("${LLVM_CONFIG_VERSION}" VERSION_GREATER 12)
-    message(FATAL_ERROR "expected LLVM 11.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
+  if("${LLVM_CONFIG_VERSION}" VERSION_GREATER 13)
+    message(FATAL_ERROR "expected LLVM 12.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
  endif()
-elseif(("${ZIG_TARGET_TRIPLE}" STREQUAL "native") OR ZIG_PREFER_LLVM_CONFIG)
+elseif(ZIG_USE_LLVM_CONFIG)
  find_program(LLVM_CONFIG_EXE
-      NAMES llvm-config-11 llvm-config-11.0 llvm-config110 llvm-config11 llvm-config
+      NAMES llvm-config-12 llvm-config-12.0 llvm-config120 llvm-config12 llvm-config
      PATHS
          "/mingw64/bin"
          "/c/msys64/mingw64/bin"
          "c:/msys64/mingw64/bin"
-          "C:/Libraries/llvm-11.0.0/bin")
+          "C:/Libraries/llvm-12.0.0/bin")

  if ("${LLVM_CONFIG_EXE}" STREQUAL "LLVM_CONFIG_EXE-NOTFOUND")
    message(FATAL_ERROR "unable to find llvm-config")
@@ -85,14 +85,14 @@ elseif(("${ZIG_TARGET_TRIPLE}" STREQUAL "native") OR ZIG_PREFER_LLVM_CONFIG)
    OUTPUT_VARIABLE LLVM_CONFIG_VERSION
    OUTPUT_STRIP_TRAILING_WHITESPACE)

-  if("${LLVM_CONFIG_VERSION}" VERSION_LESS 11)
-    message(FATAL_ERROR "expected LLVM 11.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
+  if("${LLVM_CONFIG_VERSION}" VERSION_LESS 12)
+    message(FATAL_ERROR "expected LLVM 12.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
  endif()
-  if("${LLVM_CONFIG_VERSION}" VERSION_EQUAL 12)
-    message(FATAL_ERROR "expected LLVM 11.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
+  if("${LLVM_CONFIG_VERSION}" VERSION_EQUAL 13)
+    message(FATAL_ERROR "expected LLVM 12.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
  endif()
-  if("${LLVM_CONFIG_VERSION}" VERSION_GREATER 12)
-    message(FATAL_ERROR "expected LLVM 11.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
+  if("${LLVM_CONFIG_VERSION}" VERSION_GREATER 13)
+    message(FATAL_ERROR "expected LLVM 12.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
  endif()

  execute_process(
@@ -166,7 +166,7 @@ elseif(("${ZIG_TARGET_TRIPLE}" STREQUAL "native") OR ZIG_PREFER_LLVM_CONFIG)
  set(LLVM_LIBRARIES ${LLVM_LIBRARIES} ${LLVM_SYSTEM_LIBS})

  if(NOT LLVM_LIBRARIES)
-    find_library(LLVM_LIBRARIES NAMES LLVM LLVM-11 LLVM-11.0)
+    find_library(LLVM_LIBRARIES NAMES LLVM LLVM-12 LLVM-12.0)
  endif()

  link_directories("${CMAKE_PREFIX_PATH}/lib")
@@ -180,11 +180,11 @@ else()
    find_library(LLVM_${_prettylibname_}_LIB NAMES ${_libname_}
      PATHS
      ${LLVM_LIBDIRS}
-      /usr/lib/llvm/11/lib
-      /usr/lib/llvm-11/lib
-      /usr/lib/llvm-11.0/lib
-      /usr/local/llvm110/lib
-      /usr/local/llvm11/lib
+      /usr/lib/llvm/12/lib
+      /usr/lib/llvm-12/lib
+      /usr/lib/llvm-12.0/lib
+      /usr/local/llvm120/lib
+      /usr/local/llvm12/lib
      /mingw64/lib
      /c/msys64/mingw64/lib
      c:\\msys64\\mingw64\\lib)
@@ -194,78 +194,57 @@ else()
  # This list can be re-generated with `llvm-config --libfiles` and then
  # reformatting using your favorite text editor. Note we do not execute
  # `llvm-config` here because we are cross compiling.
-  FIND_AND_ADD_LLVM_LIB(LLVMXRay)
  FIND_AND_ADD_LLVM_LIB(LLVMWindowsManifest)
-  FIND_AND_ADD_LLVM_LIB(LLVMSymbolize)
-  FIND_AND_ADD_LLVM_LIB(LLVMDebugInfoPDB)
-  FIND_AND_ADD_LLVM_LIB(LLVMOrcJIT)
-  FIND_AND_ADD_LLVM_LIB(LLVMOrcError)
-  FIND_AND_ADD_LLVM_LIB(LLVMJITLink)
-  FIND_AND_ADD_LLVM_LIB(LLVMObjectYAML)
-  FIND_AND_ADD_LLVM_LIB(LLVMMCA)
-  FIND_AND_ADD_LLVM_LIB(LLVMLTO)
-  FIND_AND_ADD_LLVM_LIB(LLVMPasses)
-  FIND_AND_ADD_LLVM_LIB(LLVMCoroutines)
-  FIND_AND_ADD_LLVM_LIB(LLVMObjCARCOpts)
-  FIND_AND_ADD_LLVM_LIB(LLVMExtensions)
-  FIND_AND_ADD_LLVM_LIB(LLVMLineEditor)
+  FIND_AND_ADD_LLVM_LIB(LLVMXRay)
  FIND_AND_ADD_LLVM_LIB(LLVMLibDriver)
-  FIND_AND_ADD_LLVM_LIB(LLVMInterpreter)
-  FIND_AND_ADD_LLVM_LIB(LLVMFuzzMutate)
-  FIND_AND_ADD_LLVM_LIB(LLVMMCJIT)
-  FIND_AND_ADD_LLVM_LIB(LLVMExecutionEngine)
-  FIND_AND_ADD_LLVM_LIB(LLVMRuntimeDyld)
-  FIND_AND_ADD_LLVM_LIB(LLVMDWARFLinker)
  FIND_AND_ADD_LLVM_LIB(LLVMDlltoolDriver)
-  FIND_AND_ADD_LLVM_LIB(LLVMOption)
-  FIND_AND_ADD_LLVM_LIB(LLVMDebugInfoGSYM)
  FIND_AND_ADD_LLVM_LIB(LLVMCoverage)
+  FIND_AND_ADD_LLVM_LIB(LLVMLineEditor)
  FIND_AND_ADD_LLVM_LIB(LLVMXCoreDisassembler)
  FIND_AND_ADD_LLVM_LIB(LLVMXCoreCodeGen)
  FIND_AND_ADD_LLVM_LIB(LLVMXCoreDesc)
  FIND_AND_ADD_LLVM_LIB(LLVMXCoreInfo)
  FIND_AND_ADD_LLVM_LIB(LLVMX86Disassembler)
-  FIND_AND_ADD_LLVM_LIB(LLVMX86CodeGen)
  FIND_AND_ADD_LLVM_LIB(LLVMX86AsmParser)
+  FIND_AND_ADD_LLVM_LIB(LLVMX86CodeGen)
  FIND_AND_ADD_LLVM_LIB(LLVMX86Desc)
  FIND_AND_ADD_LLVM_LIB(LLVMX86Info)
  FIND_AND_ADD_LLVM_LIB(LLVMWebAssemblyDisassembler)
+  FIND_AND_ADD_LLVM_LIB(LLVMWebAssemblyAsmParser)
  FIND_AND_ADD_LLVM_LIB(LLVMWebAssemblyCodeGen)
  FIND_AND_ADD_LLVM_LIB(LLVMWebAssemblyDesc)
-  FIND_AND_ADD_LLVM_LIB(LLVMWebAssemblyAsmParser)
  FIND_AND_ADD_LLVM_LIB(LLVMWebAssemblyInfo)
  FIND_AND_ADD_LLVM_LIB(LLVMSystemZDisassembler)
-  FIND_AND_ADD_LLVM_LIB(LLVMSystemZCodeGen)
  FIND_AND_ADD_LLVM_LIB(LLVMSystemZAsmParser)
+  FIND_AND_ADD_LLVM_LIB(LLVMSystemZCodeGen)
  FIND_AND_ADD_LLVM_LIB(LLVMSystemZDesc)
  FIND_AND_ADD_LLVM_LIB(LLVMSystemZInfo)
  FIND_AND_ADD_LLVM_LIB(LLVMSparcDisassembler)
-  FIND_AND_ADD_LLVM_LIB(LLVMSparcCodeGen)
  FIND_AND_ADD_LLVM_LIB(LLVMSparcAsmParser)
+  FIND_AND_ADD_LLVM_LIB(LLVMSparcCodeGen)
  FIND_AND_ADD_LLVM_LIB(LLVMSparcDesc)
  FIND_AND_ADD_LLVM_LIB(LLVMSparcInfo)
  FIND_AND_ADD_LLVM_LIB(LLVMRISCVDisassembler)
-  FIND_AND_ADD_LLVM_LIB(LLVMRISCVCodeGen)
  FIND_AND_ADD_LLVM_LIB(LLVMRISCVAsmParser)
+  FIND_AND_ADD_LLVM_LIB(LLVMRISCVCodeGen)
  FIND_AND_ADD_LLVM_LIB(LLVMRISCVDesc)
-  FIND_AND_ADD_LLVM_LIB(LLVMRISCVUtils)
  FIND_AND_ADD_LLVM_LIB(LLVMRISCVInfo)
  FIND_AND_ADD_LLVM_LIB(LLVMPowerPCDisassembler)
-  FIND_AND_ADD_LLVM_LIB(LLVMPowerPCCodeGen)
  FIND_AND_ADD_LLVM_LIB(LLVMPowerPCAsmParser)
+  FIND_AND_ADD_LLVM_LIB(LLVMPowerPCCodeGen)
  FIND_AND_ADD_LLVM_LIB(LLVMPowerPCDesc)
  FIND_AND_ADD_LLVM_LIB(LLVMPowerPCInfo)
  FIND_AND_ADD_LLVM_LIB(LLVMNVPTXCodeGen)
  FIND_AND_ADD_LLVM_LIB(LLVMNVPTXDesc)
  FIND_AND_ADD_LLVM_LIB(LLVMNVPTXInfo)
  FIND_AND_ADD_LLVM_LIB(LLVMMSP430Disassembler)
-  FIND_AND_ADD_LLVM_LIB(LLVMMSP430CodeGen)
  FIND_AND_ADD_LLVM_LIB(LLVMMSP430AsmParser)
+  FIND_AND_ADD_LLVM_LIB(LLVMMSP430CodeGen)
  FIND_AND_ADD_LLVM_LIB(LLVMMSP430Desc)
  FIND_AND_ADD_LLVM_LIB(LLVMMSP430Info)
  FIND_AND_ADD_LLVM_LIB(LLVMMipsDisassembler)
-  FIND_AND_ADD_LLVM_LIB(LLVMMipsCodeGen)
  FIND_AND_ADD_LLVM_LIB(LLVMMipsAsmParser)
+  FIND_AND_ADD_LLVM_LIB(LLVMMipsCodeGen)
  FIND_AND_ADD_LLVM_LIB(LLVMMipsDesc)
  FIND_AND_ADD_LLVM_LIB(LLVMMipsInfo)
  FIND_AND_ADD_LLVM_LIB(LLVMLanaiDisassembler)
@@ -279,44 +258,73 @@ else()
  FIND_AND_ADD_LLVM_LIB(LLVMHexagonDesc)
  FIND_AND_ADD_LLVM_LIB(LLVMHexagonInfo)
  FIND_AND_ADD_LLVM_LIB(LLVMBPFDisassembler)
-  FIND_AND_ADD_LLVM_LIB(LLVMBPFCodeGen)
  FIND_AND_ADD_LLVM_LIB(LLVMBPFAsmParser)
+  FIND_AND_ADD_LLVM_LIB(LLVMBPFCodeGen)
  FIND_AND_ADD_LLVM_LIB(LLVMBPFDesc)
  FIND_AND_ADD_LLVM_LIB(LLVMBPFInfo)
  FIND_AND_ADD_LLVM_LIB(LLVMAVRDisassembler)
-  FIND_AND_ADD_LLVM_LIB(LLVMAVRCodeGen)
  FIND_AND_ADD_LLVM_LIB(LLVMAVRAsmParser)
+  FIND_AND_ADD_LLVM_LIB(LLVMAVRCodeGen)
  FIND_AND_ADD_LLVM_LIB(LLVMAVRDesc)
  FIND_AND_ADD_LLVM_LIB(LLVMAVRInfo)
  FIND_AND_ADD_LLVM_LIB(LLVMARMDisassembler)
-  FIND_AND_ADD_LLVM_LIB(LLVMARMCodeGen)
  FIND_AND_ADD_LLVM_LIB(LLVMARMAsmParser)
+  FIND_AND_ADD_LLVM_LIB(LLVMARMCodeGen)
  FIND_AND_ADD_LLVM_LIB(LLVMARMDesc)
  FIND_AND_ADD_LLVM_LIB(LLVMARMUtils)
  FIND_AND_ADD_LLVM_LIB(LLVMARMInfo)
  FIND_AND_ADD_LLVM_LIB(LLVMAMDGPUDisassembler)
-  FIND_AND_ADD_LLVM_LIB(LLVMAMDGPUCodeGen)
-  FIND_AND_ADD_LLVM_LIB(LLVMMIRParser)
-  FIND_AND_ADD_LLVM_LIB(LLVMipo)
-  FIND_AND_ADD_LLVM_LIB(LLVMInstrumentation)
-  FIND_AND_ADD_LLVM_LIB(LLVMVectorize)
-  FIND_AND_ADD_LLVM_LIB(LLVMLinker)
-  FIND_AND_ADD_LLVM_LIB(LLVMIRReader)
-  FIND_AND_ADD_LLVM_LIB(LLVMAsmParser)
-  FIND_AND_ADD_LLVM_LIB(LLVMFrontendOpenMP)
  FIND_AND_ADD_LLVM_LIB(LLVMAMDGPUAsmParser)
+  FIND_AND_ADD_LLVM_LIB(LLVMAMDGPUCodeGen)
  FIND_AND_ADD_LLVM_LIB(LLVMAMDGPUDesc)
  FIND_AND_ADD_LLVM_LIB(LLVMAMDGPUUtils)
  FIND_AND_ADD_LLVM_LIB(LLVMAMDGPUInfo)
  FIND_AND_ADD_LLVM_LIB(LLVMAArch64Disassembler)
-  FIND_AND_ADD_LLVM_LIB(LLVMMCDisassembler)
+  FIND_AND_ADD_LLVM_LIB(LLVMAArch64AsmParser)
  FIND_AND_ADD_LLVM_LIB(LLVMAArch64CodeGen)
+  FIND_AND_ADD_LLVM_LIB(LLVMAArch64Desc)
+  FIND_AND_ADD_LLVM_LIB(LLVMAArch64Utils)
+  FIND_AND_ADD_LLVM_LIB(LLVMAArch64Info)
+  FIND_AND_ADD_LLVM_LIB(LLVMOrcJIT)
+  FIND_AND_ADD_LLVM_LIB(LLVMMCJIT)
+  FIND_AND_ADD_LLVM_LIB(LLVMJITLink)
+  FIND_AND_ADD_LLVM_LIB(LLVMOrcTargetProcess)
+  FIND_AND_ADD_LLVM_LIB(LLVMOrcShared)
+  FIND_AND_ADD_LLVM_LIB(LLVMInterpreter)
+  FIND_AND_ADD_LLVM_LIB(LLVMExecutionEngine)
+  FIND_AND_ADD_LLVM_LIB(LLVMRuntimeDyld)
+  FIND_AND_ADD_LLVM_LIB(LLVMSymbolize)
+  FIND_AND_ADD_LLVM_LIB(LLVMDebugInfoPDB)
+  FIND_AND_ADD_LLVM_LIB(LLVMDebugInfoGSYM)
+  FIND_AND_ADD_LLVM_LIB(LLVMOption)
+  FIND_AND_ADD_LLVM_LIB(LLVMObjectYAML)
+  FIND_AND_ADD_LLVM_LIB(LLVMMCA)
+  FIND_AND_ADD_LLVM_LIB(LLVMMCDisassembler)
+  FIND_AND_ADD_LLVM_LIB(LLVMLTO)
+  FIND_AND_ADD_LLVM_LIB(LLVMPasses)
  FIND_AND_ADD_LLVM_LIB(LLVMCFGuard)
+  FIND_AND_ADD_LLVM_LIB(LLVMCoroutines)
+  FIND_AND_ADD_LLVM_LIB(LLVMObjCARCOpts)
+  FIND_AND_ADD_LLVM_LIB(LLVMHelloNew)
+  FIND_AND_ADD_LLVM_LIB(LLVMipo)
+  FIND_AND_ADD_LLVM_LIB(LLVMVectorize)
+  FIND_AND_ADD_LLVM_LIB(LLVMLinker)
+  FIND_AND_ADD_LLVM_LIB(LLVMInstrumentation)
+  FIND_AND_ADD_LLVM_LIB(LLVMFrontendOpenMP)
+  FIND_AND_ADD_LLVM_LIB(LLVMFrontendOpenACC)
+  FIND_AND_ADD_LLVM_LIB(LLVMExtensions)
+  FIND_AND_ADD_LLVM_LIB(LLVMDWARFLinker)
  FIND_AND_ADD_LLVM_LIB(LLVMGlobalISel)
-  FIND_AND_ADD_LLVM_LIB(LLVMSelectionDAG)
+  FIND_AND_ADD_LLVM_LIB(LLVMMIRParser)
  FIND_AND_ADD_LLVM_LIB(LLVMAsmPrinter)
  FIND_AND_ADD_LLVM_LIB(LLVMDebugInfoDWARF)
+  FIND_AND_ADD_LLVM_LIB(LLVMSelectionDAG)
  FIND_AND_ADD_LLVM_LIB(LLVMCodeGen)
+  FIND_AND_ADD_LLVM_LIB(LLVMIRReader)
+  FIND_AND_ADD_LLVM_LIB(LLVMAsmParser)
+  FIND_AND_ADD_LLVM_LIB(LLVMInterfaceStub)
+  FIND_AND_ADD_LLVM_LIB(LLVMFileCheck)
+  FIND_AND_ADD_LLVM_LIB(LLVMFuzzMutate)
  FIND_AND_ADD_LLVM_LIB(LLVMTarget)
  FIND_AND_ADD_LLVM_LIB(LLVMScalarOpts)
  FIND_AND_ADD_LLVM_LIB(LLVMInstCombine)
@@ -327,19 +335,15 @@ else()
  FIND_AND_ADD_LLVM_LIB(LLVMProfileData)
  FIND_AND_ADD_LLVM_LIB(LLVMObject)
  FIND_AND_ADD_LLVM_LIB(LLVMTextAPI)
+  FIND_AND_ADD_LLVM_LIB(LLVMMCParser)
+  FIND_AND_ADD_LLVM_LIB(LLVMMC)
+  FIND_AND_ADD_LLVM_LIB(LLVMDebugInfoCodeView)
+  FIND_AND_ADD_LLVM_LIB(LLVMDebugInfoMSF)
  FIND_AND_ADD_LLVM_LIB(LLVMBitReader)
  FIND_AND_ADD_LLVM_LIB(LLVMCore)
  FIND_AND_ADD_LLVM_LIB(LLVMRemarks)
  FIND_AND_ADD_LLVM_LIB(LLVMBitstreamReader)
-  FIND_AND_ADD_LLVM_LIB(LLVMAArch64AsmParser)
-  FIND_AND_ADD_LLVM_LIB(LLVMMCParser)
-  FIND_AND_ADD_LLVM_LIB(LLVMAArch64Desc)
-  FIND_AND_ADD_LLVM_LIB(LLVMMC)
-  FIND_AND_ADD_LLVM_LIB(LLVMDebugInfoCodeView)
-  FIND_AND_ADD_LLVM_LIB(LLVMDebugInfoMSF)
  FIND_AND_ADD_LLVM_LIB(LLVMBinaryFormat)
-  FIND_AND_ADD_LLVM_LIB(LLVMAArch64Utils)
-  FIND_AND_ADD_LLVM_LIB(LLVMAArch64Info)
  FIND_AND_ADD_LLVM_LIB(LLVMSupport)
  FIND_AND_ADD_LLVM_LIB(LLVMDemangle)
 endif()
--- a/doc/docgen.zig
+++ b/doc/docgen.zig
@@ -4,6 +4,7 @@ const io = std.io;
 const fs = std.fs;
 const process = std.process;
 const ChildProcess = std.ChildProcess;
+const Progress = std.Progress;
 const print = std.debug.print;
 const mem = std.mem;
 const testing = std.testing;
@@ -34,15 +35,24 @@ pub fn main() !void {
    const out_file_name = try (args_it.next(allocator) orelse @panic("expected output arg"));
    defer allocator.free(out_file_name);

+    var do_code_tests = true;
+    if (args_it.next(allocator)) |arg| {
+        if (mem.eql(u8, try arg, "--skip-code-tests")) {
+            do_code_tests = false;
+        } else {
+            @panic("unrecognized arg");
+        }
+    }
+
    var in_file = try fs.cwd().openFile(in_file_name, .{ .read = true });
    defer in_file.close();

    var out_file = try fs.cwd().createFile(out_file_name, .{});
    defer out_file.close();

-    const input_file_bytes = try in_file.inStream().readAllAlloc(allocator, max_doc_file_size);
+    const input_file_bytes = try in_file.reader().readAllAlloc(allocator, max_doc_file_size);

-    var buffered_out_stream = io.bufferedOutStream(out_file.outStream());
+    var buffered_writer = io.bufferedWriter(out_file.writer());

    var tokenizer = Tokenizer.init(in_file_name, input_file_bytes);
    var toc = try genToc(allocator, &tokenizer);
@@ -50,8 +60,8 @@ pub fn main() !void {
    try fs.cwd().makePath(tmp_dir_name);
    defer fs.cwd().deleteTree(tmp_dir_name) catch {};

-    try genHtml(allocator, &tokenizer, &toc, buffered_out_stream.outStream(), zig_exe);
-    try buffered_out_stream.flush();
+    try genHtml(allocator, &tokenizer, &toc, buffered_writer.writer(), zig_exe, do_code_tests);
+    try buffered_writer.flush();
 }

 const Token = struct {
@@ -215,9 +225,9 @@ const Tokenizer = struct {
 fn parseError(tokenizer: *Tokenizer, token: Token, comptime fmt: []const u8, args: anytype) anyerror {
    const loc = tokenizer.getTokenLocation(token);
    const args_prefix = .{ tokenizer.source_file_name, loc.line + 1, loc.column + 1 };
-    print("{}:{}:{}: error: " ++ fmt ++ "\n", args_prefix ++ args);
+    print("{s}:{d}:{d}: error: " ++ fmt ++ "\n", args_prefix ++ args);
    if (loc.line_start <= loc.line_end) {
-        print("{}\n", .{tokenizer.buffer[loc.line_start..loc.line_end]});
+        print("{s}\n", .{tokenizer.buffer[loc.line_start..loc.line_end]});
        {
            var i: usize = 0;
            while (i < loc.column) : (i += 1) {
@@ -225,7 +235,7 @@ fn parseError(tokenizer: *Tokenizer, token: Token, comptime fmt: []const u8, arg
            }
        }
        {
-            const caret_count = token.end - token.start;
+            const caret_count = std.math.min(token.end, loc.line_end) - token.start;
            var i: usize = 0;
            while (i < caret_count) : (i += 1) {
                print("~", .{});
@@ -238,7 +248,7 @@ fn parseError(tokenizer: *Tokenizer, token: Token, comptime fmt: []const u8, arg

 fn assertToken(tokenizer: *Tokenizer, token: Token, id: Token.Id) !void {
    if (token.id != id) {
-        return parseError(tokenizer, token, "expected {}, found {}", .{ @tagName(id), @tagName(token.id) });
+        return parseError(tokenizer, token, "expected {s}, found {s}", .{ @tagName(id), @tagName(token.id) });
    }
 }

@@ -274,6 +284,7 @@ const Code = struct {
    link_objects: []const []const u8,
    target_str: ?[]const u8,
    link_libc: bool,
+    link_mode: ?std.builtin.LinkMode,
    disable_cache: bool,

    const Id = union(enum) {
@@ -325,7 +336,7 @@ fn genToc(allocator: *mem.Allocator, tokenizer: *Tokenizer) !Toc {
    var toc_buf = std.ArrayList(u8).init(allocator);
    defer toc_buf.deinit();

-    var toc = toc_buf.outStream();
+    var toc = toc_buf.writer();

    var nodes = std.ArrayList(Node).init(allocator);
    defer nodes.deinit();
@@ -374,7 +385,7 @@ fn genToc(allocator: *mem.Allocator, tokenizer: *Tokenizer) !Toc {
                                    return parseError(
                                        tokenizer,
                                        bracket_tok,
-                                        "unrecognized header_open param: {}",
+                                        "unrecognized header_open param: {s}",
                                        .{param},
                                    );
                                }
@@ -393,9 +404,9 @@ fn genToc(allocator: *mem.Allocator, tokenizer: *Tokenizer) !Toc {
                            .n = header_stack_size,
                        },
                    });
-                    if (try urls.fetchPut(urlized, tag_token)) |entry| {
-                        parseError(tokenizer, tag_token, "duplicate header url: #{}", .{urlized}) catch {};
-                        parseError(tokenizer, entry.value, "other tag here", .{}) catch {};
+                    if (try urls.fetchPut(urlized, tag_token)) |kv| {
+                        parseError(tokenizer, tag_token, "duplicate header url: #{s}", .{urlized}) catch {};
+                        parseError(tokenizer, kv.value, "other tag here", .{}) catch {};
                        return error.ParseError;
                    }
                    if (last_action == Action.Open) {
@@ -411,7 +422,7 @@ fn genToc(allocator: *mem.Allocator, tokenizer: *Tokenizer) !Toc {
                    }
                    last_columns = columns;
                    try toc.writeByteNTimes(' ', 4 + header_stack_size * 4);
-                    try toc.print("<li><a id=\"toc-{}\" href=\"#{}\">{}</a>", .{ urlized, urlized, content });
+                    try toc.print("<li><a id=\"toc-{s}\" href=\"#{s}\">{s}</a>", .{ urlized, urlized, content });
                } else if (mem.eql(u8, tag_name, "header_close")) {
                    if (header_stack_size == 0) {
                        return parseError(tokenizer, tag_token, "unbalanced close header", .{});
@@ -515,7 +526,7 @@ fn genToc(allocator: *mem.Allocator, tokenizer: *Tokenizer) !Toc {
                        code_kind_id = Code.Id{ .Obj = null };
                        is_inline = true;
                    } else {
-                        return parseError(tokenizer, code_kind_tok, "unrecognized code kind: {}", .{code_kind_str});
+                        return parseError(tokenizer, code_kind_tok, "unrecognized code kind: {s}", .{code_kind_str});
                    }

                    var mode: builtin.Mode = .Debug;
@@ -523,6 +534,7 @@ fn genToc(allocator: *mem.Allocator, tokenizer: *Tokenizer) !Toc {
                    defer link_objects.deinit();
                    var target_str: ?[]const u8 = null;
                    var link_libc = false;
+                    var link_mode: ?std.builtin.LinkMode = null;
                    var disable_cache = false;

                    const source_token = while (true) {
@@ -552,6 +564,8 @@ fn genToc(allocator: *mem.Allocator, tokenizer: *Tokenizer) !Toc {
                            target_str = "wasm32-wasi";
                        } else if (mem.eql(u8, end_tag_name, "link_libc")) {
                            link_libc = true;
+                        } else if (mem.eql(u8, end_tag_name, "link_mode_dynamic")) {
+                            link_mode = .Dynamic;
                        } else if (mem.eql(u8, end_tag_name, "code_end")) {
                            _ = try eatToken(tokenizer, Token.Id.BracketClose);
                            break content_tok;
@@ -559,13 +573,12 @@ fn genToc(allocator: *mem.Allocator, tokenizer: *Tokenizer) !Toc {
                            return parseError(
                                tokenizer,
                                end_code_tag,
-                                "invalid token inside code_begin: {}",
+                                "invalid token inside code_begin: {s}",
                                .{end_tag_name},
                            );
                        }
                        _ = try eatToken(tokenizer, Token.Id.BracketClose);
-                    } else
-                        unreachable; // TODO issue #707
+                    } else unreachable; // TODO issue #707
                    try nodes.append(Node{
                        .Code = Code{
                            .id = code_kind_id,
@@ -576,6 +589,7 @@ fn genToc(allocator: *mem.Allocator, tokenizer: *Tokenizer) !Toc {
                            .link_objects = link_objects.toOwnedSlice(),
                            .target_str = target_str,
                            .link_libc = link_libc,
+                            .link_mode = link_mode,
                            .disable_cache = disable_cache,
                        },
                    });
@@ -590,14 +604,14 @@ fn genToc(allocator: *mem.Allocator, tokenizer: *Tokenizer) !Toc {
                        return parseError(
                            tokenizer,
                            end_syntax_tag,
-                            "invalid token inside syntax: {}",
+                            "invalid token inside syntax: {s}",
                            .{end_tag_name},
                        );
                    }
                    _ = try eatToken(tokenizer, Token.Id.BracketClose);
                    try nodes.append(Node{ .Syntax = content_tok });
                } else {
-                    return parseError(tokenizer, tag_token, "unrecognized tag name: {}", .{tag_name});
+                    return parseError(tokenizer, tag_token, "unrecognized tag name: {s}", .{tag_name});
                }
            },
            else => return parseError(tokenizer, token, "invalid token", .{}),
@@ -615,7 +629,7 @@ fn urlize(allocator: *mem.Allocator, input: []const u8) ![]u8 {
    var buf = std.ArrayList(u8).init(allocator);
    defer buf.deinit();

-    const out = buf.outStream();
+    const out = buf.writer();
    for (input) |c| {
        switch (c) {
            'a'...'z', 'A'...'Z', '_', '-', '0'...'9' => {
@@ -634,7 +648,7 @@ fn escapeHtml(allocator: *mem.Allocator, input: []const u8) ![]u8 {
    var buf = std.ArrayList(u8).init(allocator);
    defer buf.deinit();

-    const out = buf.outStream();
+    const out = buf.writer();
    try writeEscaped(out, input);
    return buf.toOwnedSlice();
 }
@@ -680,7 +694,7 @@ fn termColor(allocator: *mem.Allocator, input: []const u8) ![]u8 {
    var buf = std.ArrayList(u8).init(allocator);
    defer buf.deinit();

-    var out = buf.outStream();
+    var out = buf.writer();
    var number_start_index: usize = undefined;
    var first_number: usize = undefined;
    var second_number: usize = undefined;
@@ -744,7 +758,7 @@ fn termColor(allocator: *mem.Allocator, input: []const u8) ![]u8 {
                        try out.writeAll("</span>");
                    }
                    if (first_number != 0 or second_number != 0) {
-                        try out.print("<span class=\"t{}_{}\">", .{ first_number, second_number });
+                        try out.print("<span class=\"t{d}_{d}\">", .{ first_number, second_number });
                        open_span_count += 1;
                    }
                },
@@ -781,106 +795,119 @@ fn tokenizeAndPrintRaw(docgen_tokenizer: *Tokenizer, out: anytype, source_token:
        next_tok_is_fn = false;

        const token = tokenizer.next();
-        try writeEscaped(out, src[index..token.loc.start]);
-        switch (token.id) {
-            .Eof => break,
+        if (mem.indexOf(u8, src[index..token.loc.start], "//")) |comment_start_off| {
+            // render one comment
+            const comment_start = index + comment_start_off;
+            const comment_end_off = mem.indexOf(u8, src[comment_start..token.loc.start], "\n");
+            const comment_end = if (comment_end_off) |o| comment_start + o else token.loc.start;

-            .Keyword_align,
-            .Keyword_and,
-            .Keyword_asm,
-            .Keyword_async,
-            .Keyword_await,
-            .Keyword_break,
-            .Keyword_catch,
-            .Keyword_comptime,
-            .Keyword_const,
-            .Keyword_continue,
-            .Keyword_defer,
-            .Keyword_else,
-            .Keyword_enum,
-            .Keyword_errdefer,
-            .Keyword_error,
-            .Keyword_export,
-            .Keyword_extern,
-            .Keyword_for,
-            .Keyword_if,
-            .Keyword_inline,
-            .Keyword_noalias,
-            .Keyword_noinline,
-            .Keyword_nosuspend,
-            .Keyword_opaque,
-            .Keyword_or,
-            .Keyword_orelse,
-            .Keyword_packed,
-            .Keyword_anyframe,
-            .Keyword_pub,
-            .Keyword_resume,
-            .Keyword_return,
-            .Keyword_linksection,
-            .Keyword_callconv,
-            .Keyword_struct,
-            .Keyword_suspend,
-            .Keyword_switch,
-            .Keyword_test,
-            .Keyword_threadlocal,
-            .Keyword_try,
-            .Keyword_union,
-            .Keyword_unreachable,
-            .Keyword_usingnamespace,
-            .Keyword_var,
-            .Keyword_volatile,
-            .Keyword_allowzero,
-            .Keyword_while,
-            .Keyword_anytype,
+            try writeEscaped(out, src[index..comment_start]);
+            try out.writeAll("<span class=\"tok-comment\">");
+            try writeEscaped(out, src[comment_start..comment_end]);
+            try out.writeAll("</span>");
+            index = comment_end;
+            tokenizer.index = index;
+            continue;
+        }
+
+        try writeEscaped(out, src[index..token.loc.start]);
+        switch (token.tag) {
+            .eof => break,
+
+            .keyword_align,
+            .keyword_and,
+            .keyword_asm,
+            .keyword_async,
+            .keyword_await,
+            .keyword_break,
+            .keyword_catch,
+            .keyword_comptime,
+            .keyword_const,
+            .keyword_continue,
+            .keyword_defer,
+            .keyword_else,
+            .keyword_enum,
+            .keyword_errdefer,
+            .keyword_error,
+            .keyword_export,
+            .keyword_extern,
+            .keyword_for,
+            .keyword_if,
+            .keyword_inline,
+            .keyword_noalias,
+            .keyword_noinline,
+            .keyword_nosuspend,
+            .keyword_opaque,
+            .keyword_or,
+            .keyword_orelse,
+            .keyword_packed,
+            .keyword_anyframe,
+            .keyword_pub,
+            .keyword_resume,
+            .keyword_return,
+            .keyword_linksection,
+            .keyword_callconv,
+            .keyword_struct,
+            .keyword_suspend,
+            .keyword_switch,
+            .keyword_test,
+            .keyword_threadlocal,
+            .keyword_try,
+            .keyword_union,
+            .keyword_unreachable,
+            .keyword_usingnamespace,
+            .keyword_var,
+            .keyword_volatile,
+            .keyword_allowzero,
+            .keyword_while,
+            .keyword_anytype,
            => {
                try out.writeAll("<span class=\"tok-kw\">");
                try writeEscaped(out, src[token.loc.start..token.loc.end]);
                try out.writeAll("</span>");
            },

-            .Keyword_fn => {
+            .keyword_fn => {
                try out.writeAll("<span class=\"tok-kw\">");
                try writeEscaped(out, src[token.loc.start..token.loc.end]);
                try out.writeAll("</span>");
                next_tok_is_fn = true;
            },

-            .Keyword_undefined,
-            .Keyword_null,
-            .Keyword_true,
-            .Keyword_false,
+            .keyword_undefined,
+            .keyword_null,
+            .keyword_true,
+            .keyword_false,
            => {
                try out.writeAll("<span class=\"tok-null\">");
                try writeEscaped(out, src[token.loc.start..token.loc.end]);
                try out.writeAll("</span>");
            },

-            .StringLiteral,
-            .MultilineStringLiteralLine,
-            .CharLiteral,
+            .string_literal,
+            .multiline_string_literal_line,
+            .char_literal,
            => {
                try out.writeAll("<span class=\"tok-str\">");
                try writeEscaped(out, src[token.loc.start..token.loc.end]);
                try out.writeAll("</span>");
            },

-            .Builtin => {
+            .builtin => {
                try out.writeAll("<span class=\"tok-builtin\">");
                try writeEscaped(out, src[token.loc.start..token.loc.end]);
                try out.writeAll("</span>");
            },

-            .LineComment,
-            .DocComment,
-            .ContainerDocComment,
-            .ShebangLine,
+            .doc_comment,
+            .container_doc_comment,
            => {
                try out.writeAll("<span class=\"tok-comment\">");
                try writeEscaped(out, src[token.loc.start..token.loc.end]);
                try out.writeAll("</span>");
            },

-            .Identifier => {
+            .identifier => {
                if (prev_tok_was_fn) {
                    try out.writeAll("<span class=\"tok-fn\">");
                    try writeEscaped(out, src[token.loc.start..token.loc.end]);
@@ -908,71 +935,71 @@ fn tokenizeAndPrintRaw(docgen_tokenizer: *Tokenizer, out: anytype, source_token:
                }
            },

-            .IntegerLiteral,
-            .FloatLiteral,
+            .integer_literal,
+            .float_literal,
            => {
                try out.writeAll("<span class=\"tok-number\">");
                try writeEscaped(out, src[token.loc.start..token.loc.end]);
                try out.writeAll("</span>");
            },

-            .Bang,
-            .Pipe,
-            .PipePipe,
-            .PipeEqual,
-            .Equal,
-            .EqualEqual,
-            .EqualAngleBracketRight,
-            .BangEqual,
-            .LParen,
-            .RParen,
-            .Semicolon,
-            .Percent,
-            .PercentEqual,
-            .LBrace,
-            .RBrace,
-            .LBracket,
-            .RBracket,
-            .Period,
-            .PeriodAsterisk,
-            .Ellipsis2,
-            .Ellipsis3,
-            .Caret,
-            .CaretEqual,
-            .Plus,
-            .PlusPlus,
-            .PlusEqual,
-            .PlusPercent,
-            .PlusPercentEqual,
-            .Minus,
-            .MinusEqual,
-            .MinusPercent,
-            .MinusPercentEqual,
-            .Asterisk,
-            .AsteriskEqual,
-            .AsteriskAsterisk,
-            .AsteriskPercent,
-            .AsteriskPercentEqual,
-            .Arrow,
-            .Colon,
-            .Slash,
-            .SlashEqual,
-            .Comma,
-            .Ampersand,
-            .AmpersandEqual,
-            .QuestionMark,
-            .AngleBracketLeft,
-            .AngleBracketLeftEqual,
-            .AngleBracketAngleBracketLeft,
-            .AngleBracketAngleBracketLeftEqual,
-            .AngleBracketRight,
-            .AngleBracketRightEqual,
-            .AngleBracketAngleBracketRight,
-            .AngleBracketAngleBracketRightEqual,
-            .Tilde,
+            .bang,
+            .pipe,
+            .pipe_pipe,
+            .pipe_equal,
+            .equal,
+            .equal_equal,
+            .equal_angle_bracket_right,
+            .bang_equal,
+            .l_paren,
+            .r_paren,
+            .semicolon,
+            .percent,
+            .percent_equal,
+            .l_brace,
+            .r_brace,
+            .l_bracket,
+            .r_bracket,
+            .period,
+            .period_asterisk,
+            .ellipsis2,
+            .ellipsis3,
+            .caret,
+            .caret_equal,
+            .plus,
+            .plus_plus,
+            .plus_equal,
+            .plus_percent,
+            .plus_percent_equal,
+            .minus,
+            .minus_equal,
+            .minus_percent,
+            .minus_percent_equal,
+            .asterisk,
+            .asterisk_equal,
+            .asterisk_asterisk,
+            .asterisk_percent,
+            .asterisk_percent_equal,
+            .arrow,
+            .colon,
+            .slash,
+            .slash_equal,
+            .comma,
+            .ampersand,
+            .ampersand_equal,
+            .question_mark,
+            .angle_bracket_left,
+            .angle_bracket_left_equal,
+            .angle_bracket_angle_bracket_left,
+            .angle_bracket_angle_bracket_left_equal,
+            .angle_bracket_right,
+            .angle_bracket_right_equal,
+            .angle_bracket_angle_bracket_right,
+            .angle_bracket_angle_bracket_right_equal,
+            .tilde,
            => try writeEscaped(out, src[token.loc.start..token.loc.end]),

-            .Invalid, .Invalid_ampersands, .Invalid_periodasterisks => return parseError(
+            .invalid, .invalid_ampersands, .invalid_periodasterisks => return parseError(
                docgen_tokenizer,
                source_token,
                "syntax error",
@@ -989,24 +1016,28 @@ fn tokenizeAndPrint(docgen_tokenizer: *Tokenizer, out: anytype, source_token: To
    return tokenizeAndPrintRaw(docgen_tokenizer, out, source_token, raw_src);
 }

-fn genHtml(allocator: *mem.Allocator, tokenizer: *Tokenizer, toc: *Toc, out: anytype, zig_exe: []const u8) !void {
+fn genHtml(allocator: *mem.Allocator, tokenizer: *Tokenizer, toc: *Toc, out: anytype, zig_exe: []const u8, do_code_tests: bool) !void {
    var code_progress_index: usize = 0;
+    var progress = Progress{};
+    const root_node = try progress.start("Generating docgen examples", toc.nodes.len);
+    defer root_node.end();

    var env_map = try process.getEnvMap(allocator);
-    try env_map.set("ZIG_DEBUG_COLOR", "1");
+    try env_map.put("ZIG_DEBUG_COLOR", "1");

    const builtin_code = try getBuiltinCode(allocator, &env_map, zig_exe);

    for (toc.nodes) |node| {
+        defer root_node.completeOne();
        switch (node) {
            .Content => |data| {
                try out.writeAll(data);
            },
            .Link => |info| {
                if (!toc.urls.contains(info.url)) {
-                    return parseError(tokenizer, info.token, "url not found: {}", .{info.url});
+                    return parseError(tokenizer, info.token, "url not found: {s}", .{info.url});
                }
-                try out.print("<a href=\"#{}\">{}</a>", .{ info.url, info.name });
+                try out.print("<a href=\"#{s}\">{s}</a>", .{ info.url, info.name });
            },
            .Nav => {
                try out.writeAll(toc.toc);
@@ -1018,7 +1049,7 @@ fn genHtml(allocator: *mem.Allocator, tokenizer: *Tokenizer, toc: *Toc, out: any
            },
            .HeaderOpen => |info| {
                try out.print(
-                    "<h{} id=\"{}\"><a href=\"#toc-{}\">{}</a> <a class=\"hdr\" href=\"#{}\">§</a></h{}>\n",
+                    "<h{d} id=\"{s}\"><a href=\"#toc-{s}\">{s}</a> <a class=\"hdr\" href=\"#{s}\">§</a></h{d}>\n",
                    .{ info.n, info.url, info.url, info.name, info.url, info.n },
                );
            },
@@ -1027,9 +1058,9 @@ fn genHtml(allocator: *mem.Allocator, tokenizer: *Tokenizer, toc: *Toc, out: any
                for (items) |item| {
                    const url = try urlize(allocator, item.name);
                    if (!toc.urls.contains(url)) {
-                        return parseError(tokenizer, item.token, "url not found: {}", .{url});
+                        return parseError(tokenizer, item.token, "url not found: {s}", .{url});
                    }
-                    try out.print("<li><a href=\"#{}\">{}</a></li>\n", .{ url, item.name });
+                    try out.print("<li><a href=\"#{s}\">{s}</a></li>\n", .{ url, item.name });
                }
                try out.writeAll("</ul>\n");
            },
@@ -1037,18 +1068,20 @@ fn genHtml(allocator: *mem.Allocator, tokenizer: *Tokenizer, toc: *Toc, out: any
                try tokenizeAndPrint(tokenizer, out, content_tok);
            },
            .Code => |code| {
-                code_progress_index += 1;
-                print("docgen example code {}/{}...", .{ code_progress_index, tokenizer.code_node_count });
-
                const raw_source = tokenizer.buffer[code.source_token.start..code.source_token.end];
                const trimmed_raw_source = mem.trim(u8, raw_source, " \n");
                if (!code.is_inline) {
-                    try out.print("<p class=\"file\">{}.zig</p>", .{code.name});
+                    try out.print("<p class=\"file\">{s}.zig</p>", .{code.name});
                }
                try out.writeAll("<pre>");
                try tokenizeAndPrint(tokenizer, out, code.source_token);
                try out.writeAll("</pre>");
-                const name_plus_ext = try std.fmt.allocPrint(allocator, "{}.zig", .{code.name});
+
+                if (!do_code_tests) {
+                    continue;
+                }
+
+                const name_plus_ext = try std.fmt.allocPrint(allocator, "{s}.zig", .{code.name});
                const tmp_source_file_name = try fs.path.join(
                    allocator,
                    &[_][]const u8{ tmp_dir_name, name_plus_ext },
@@ -1057,7 +1090,7 @@ fn genHtml(allocator: *mem.Allocator, tokenizer: *Tokenizer, toc: *Toc, out: any

                switch (code.id) {
                    Code.Id.Exe => |expected_outcome| code_block: {
-                        const name_plus_bin_ext = try std.fmt.allocPrint(allocator, "{}{}", .{ code.name, exe_ext });
+                        const name_plus_bin_ext = try std.fmt.allocPrint(allocator, "{s}{s}", .{ code.name, exe_ext });
                        var build_args = std.ArrayList([]const u8).init(allocator);
                        defer build_args.deinit();
                        try build_args.appendSlice(&[_][]const u8{
@@ -1066,7 +1099,7 @@ fn genHtml(allocator: *mem.Allocator, tokenizer: *Tokenizer, toc: *Toc, out: any
                            "--color",        "on",
                            "--enable-cache", tmp_source_file_name,
                        });
-                        try out.print("<pre><code class=\"shell\">$ zig build-exe {}.zig", .{code.name});
+                        try out.print("<pre><code class=\"shell\">$ zig build-exe {s}.zig", .{code.name});
                        switch (code.mode) {
                            .Debug => {},
                            else => {
@@ -1075,7 +1108,7 @@ fn genHtml(allocator: *mem.Allocator, tokenizer: *Tokenizer, toc: *Toc, out: any
                            },
                        }
                        for (code.link_objects) |link_object| {
-                            const name_with_ext = try std.fmt.allocPrint(allocator, "{}{}", .{ link_object, obj_ext });
+                            const name_with_ext = try std.fmt.allocPrint(allocator, "{s}{s}", .{ link_object, obj_ext });
                            const full_path_object = try fs.path.join(
                                allocator,
                                &[_][]const u8{ tmp_dir_name, name_with_ext },
@@ -1093,7 +1126,7 @@ fn genHtml(allocator: *mem.Allocator, tokenizer: *Tokenizer, toc: *Toc, out: any
                        if (code.target_str) |triple| {
                            try build_args.appendSlice(&[_][]const u8{ "-target", triple });
                            if (!code.is_inline) {
-                                try out.print(" -target {}", .{triple});
+                                try out.print(" -target {s}", .{triple});
                            }
                        }
                        if (expected_outcome == .BuildFail) {
@@ -1106,20 +1139,22 @@ fn genHtml(allocator: *mem.Allocator, tokenizer: *Tokenizer, toc: *Toc, out: any
                            switch (result.term) {
                                .Exited => |exit_code| {
                                    if (exit_code == 0) {
-                                        print("{}\nThe following command incorrectly succeeded:\n", .{result.stderr});
+                                        progress.log("", .{});
+                                        print("{s}\nThe following command incorrectly succeeded:\n", .{result.stderr});
                                        dumpArgs(build_args.items);
                                        return parseError(tokenizer, code.source_token, "example incorrectly compiled", .{});
                                    }
                                },
                                else => {
-                                    print("{}\nThe following command crashed:\n", .{result.stderr});
+                                    progress.log("", .{});
+                                    print("{s}\nThe following command crashed:\n", .{result.stderr});
                                    dumpArgs(build_args.items);
                                    return parseError(tokenizer, code.source_token, "example compile crashed", .{});
                                },
                            }
                            const escaped_stderr = try escapeHtml(allocator, result.stderr);
                            const colored_stderr = try termColor(allocator, escaped_stderr);
-                            try out.print("\n{}</code></pre>\n", .{colored_stderr});
+                            try out.print("\n{s}</code></pre>\n", .{colored_stderr});
                            break :code_block;
                        }
                        const exec_result = exec(allocator, &env_map, build_args.items) catch
@@ -1138,7 +1173,7 @@ fn genHtml(allocator: *mem.Allocator, tokenizer: *Tokenizer, toc: *Toc, out: any
                        }

                        const path_to_exe_dir = mem.trim(u8, exec_result.stdout, " \r\n");
-                        const path_to_exe_basename = try std.fmt.allocPrint(allocator, "{}{}", .{
+                        const path_to_exe_basename = try std.fmt.allocPrint(allocator, "{s}{s}", .{
                            code.name,
                            target.exeFileExt(),
                        });
@@ -1160,7 +1195,8 @@ fn genHtml(allocator: *mem.Allocator, tokenizer: *Tokenizer, toc: *Toc, out: any
                            switch (result.term) {
                                .Exited => |exit_code| {
                                    if (exit_code == 0) {
-                                        print("{}\nThe following command incorrectly succeeded:\n", .{result.stderr});
+                                        progress.log("", .{});
+                                        print("{s}\nThe following command incorrectly succeeded:\n", .{result.stderr});
                                        dumpArgs(run_args);
                                        return parseError(tokenizer, code.source_token, "example incorrectly compiled", .{});
                                    }
@@ -1179,7 +1215,7 @@ fn genHtml(allocator: *mem.Allocator, tokenizer: *Tokenizer, toc: *Toc, out: any
                        const colored_stderr = try termColor(allocator, escaped_stderr);
                        const colored_stdout = try termColor(allocator, escaped_stdout);

-                        try out.print("\n$ ./{}\n{}{}", .{ code.name, colored_stdout, colored_stderr });
+                        try out.print("\n$ ./{s}\n{s}{s}", .{ code.name, colored_stdout, colored_stderr });
                        if (exited_with_signal) {
                            try out.print("(process terminated by signal)", .{});
                        }
@@ -1190,7 +1226,7 @@ fn genHtml(allocator: *mem.Allocator, tokenizer: *Tokenizer, toc: *Toc, out: any
                        defer test_args.deinit();

                        try test_args.appendSlice(&[_][]const u8{ zig_exe, "test", tmp_source_file_name });
-                        try out.print("<pre><code class=\"shell\">$ zig test {}.zig", .{code.name});
+                        try out.print("<pre><code class=\"shell\">$ zig test {s}.zig", .{code.name});
                        switch (code.mode) {
                            .Debug => {},
                            else => {
@@ -1204,12 +1240,12 @@ fn genHtml(allocator: *mem.Allocator, tokenizer: *Tokenizer, toc: *Toc, out: any
                        }
                        if (code.target_str) |triple| {
                            try test_args.appendSlice(&[_][]const u8{ "-target", triple });
-                            try out.print(" -target {}", .{triple});
+                            try out.print(" -target {s}", .{triple});
                        }
                        const result = exec(allocator, &env_map, test_args.items) catch return parseError(tokenizer, code.source_token, "test failed", .{});
                        const escaped_stderr = try escapeHtml(allocator, result.stderr);
                        const escaped_stdout = try escapeHtml(allocator, result.stdout);
-                        try out.print("\n{}{}</code></pre>\n", .{ escaped_stderr, escaped_stdout });
+                        try out.print("\n{s}{s}</code></pre>\n", .{ escaped_stderr, escaped_stdout });
                    },
                    Code.Id.TestError => |error_match| {
                        var test_args = std.ArrayList([]const u8).init(allocator);
@@ -1222,7 +1258,7 @@ fn genHtml(allocator: *mem.Allocator, tokenizer: *Tokenizer, toc: *Toc, out: any
                            "on",
                            tmp_source_file_name,
                        });
-                        try out.print("<pre><code class=\"shell\">$ zig test {}.zig", .{code.name});
+                        try out.print("<pre><code class=\"shell\">$ zig test {s}.zig", .{code.name});
                        switch (code.mode) {
                            .Debug => {},
                            else => {
@@ -1239,24 +1275,27 @@ fn genHtml(allocator: *mem.Allocator, tokenizer: *Tokenizer, toc: *Toc, out: any
                        switch (result.term) {
                            .Exited => |exit_code| {
                                if (exit_code == 0) {
-                                    print("{}\nThe following command incorrectly succeeded:\n", .{result.stderr});
+                                    progress.log("", .{});
+                                    print("{s}\nThe following command incorrectly succeeded:\n", .{result.stderr});
                                    dumpArgs(test_args.items);
                                    return parseError(tokenizer, code.source_token, "example incorrectly compiled", .{});
                                }
                            },
                            else => {
-                                print("{}\nThe following command crashed:\n", .{result.stderr});
+                                progress.log("", .{});
+                                print("{s}\nThe following command crashed:\n", .{result.stderr});
                                dumpArgs(test_args.items);
                                return parseError(tokenizer, code.source_token, "example compile crashed", .{});
                            },
                        }
                        if (mem.indexOf(u8, result.stderr, error_match) == null) {
-                            print("{}\nExpected to find '{}' in stderr\n", .{ result.stderr, error_match });
+                            progress.log("", .{});
+                            print("{s}\nExpected to find '{s}' in stderr\n", .{ result.stderr, error_match });
                            return parseError(tokenizer, code.source_token, "example did not have expected compile error", .{});
                        }
                        const escaped_stderr = try escapeHtml(allocator, result.stderr);
                        const colored_stderr = try termColor(allocator, escaped_stderr);
-                        try out.print("\n{}</code></pre>\n", .{colored_stderr});
+                        try out.print("\n{s}</code></pre>\n", .{colored_stderr});
                    },

                    Code.Id.TestSafety => |error_match| {
@@ -1294,31 +1333,34 @@ fn genHtml(allocator: *mem.Allocator, tokenizer: *Tokenizer, toc: *Toc, out: any
                        switch (result.term) {
                            .Exited => |exit_code| {
                                if (exit_code == 0) {
-                                    print("{}\nThe following command incorrectly succeeded:\n", .{result.stderr});
+                                    progress.log("", .{});
+                                    print("{s}\nThe following command incorrectly succeeded:\n", .{result.stderr});
                                    dumpArgs(test_args.items);
                                    return parseError(tokenizer, code.source_token, "example test incorrectly succeeded", .{});
                                }
                            },
                            else => {
-                                print("{}\nThe following command crashed:\n", .{result.stderr});
+                                progress.log("", .{});
+                                print("{s}\nThe following command crashed:\n", .{result.stderr});
                                dumpArgs(test_args.items);
                                return parseError(tokenizer, code.source_token, "example compile crashed", .{});
                            },
                        }
                        if (mem.indexOf(u8, result.stderr, error_match) == null) {
-                            print("{}\nExpected to find '{}' in stderr\n", .{ result.stderr, error_match });
+                            progress.log("", .{});
+                            print("{s}\nExpected to find '{s}' in stderr\n", .{ result.stderr, error_match });
                            return parseError(tokenizer, code.source_token, "example did not have expected runtime safety error message", .{});
                        }
                        const escaped_stderr = try escapeHtml(allocator, result.stderr);
                        const colored_stderr = try termColor(allocator, escaped_stderr);
-                        try out.print("<pre><code class=\"shell\">$ zig test {}.zig{}\n{}</code></pre>\n", .{
+                        try out.print("<pre><code class=\"shell\">$ zig test {s}.zig {s}\n{s}</code></pre>\n", .{
                            code.name,
                            mode_arg,
                            colored_stderr,
                        });
                    },
                    Code.Id.Obj => |maybe_error_match| {
-                        const name_plus_obj_ext = try std.fmt.allocPrint(allocator, "{}{}", .{ code.name, obj_ext });
+                        const name_plus_obj_ext = try std.fmt.allocPrint(allocator, "{s}{s}", .{ code.name, obj_ext });
                        const tmp_obj_file_name = try fs.path.join(
                            allocator,
                            &[_][]const u8{ tmp_dir_name, name_plus_obj_ext },
@@ -1326,7 +1368,7 @@ fn genHtml(allocator: *mem.Allocator, tokenizer: *Tokenizer, toc: *Toc, out: any
                        var build_args = std.ArrayList([]const u8).init(allocator);
                        defer build_args.deinit();

-                        const name_plus_h_ext = try std.fmt.allocPrint(allocator, "{}.h", .{code.name});
+                        const name_plus_h_ext = try std.fmt.allocPrint(allocator, "{s}.h", .{code.name});
                        const output_h_file_name = try fs.path.join(
                            allocator,
                            &[_][]const u8{ tmp_dir_name, name_plus_h_ext },
@@ -1345,7 +1387,7 @@ fn genHtml(allocator: *mem.Allocator, tokenizer: *Tokenizer, toc: *Toc, out: any
                            }),
                        });
                        if (!code.is_inline) {
-                            try out.print("<pre><code class=\"shell\">$ zig build-obj {}.zig", .{code.name});
+                            try out.print("<pre><code class=\"shell\">$ zig build-obj {s}.zig", .{code.name});
                        }

                        switch (code.mode) {
@@ -1360,7 +1402,7 @@ fn genHtml(allocator: *mem.Allocator, tokenizer: *Tokenizer, toc: *Toc, out: any

                        if (code.target_str) |triple| {
                            try build_args.appendSlice(&[_][]const u8{ "-target", triple });
-                            try out.print(" -target {}", .{triple});
+                            try out.print(" -target {s}", .{triple});
                        }

                        if (maybe_error_match) |error_match| {
@@ -1373,24 +1415,27 @@ fn genHtml(allocator: *mem.Allocator, tokenizer: *Tokenizer, toc: *Toc, out: any
                            switch (result.term) {
                                .Exited => |exit_code| {
                                    if (exit_code == 0) {
-                                        print("{}\nThe following command incorrectly succeeded:\n", .{result.stderr});
+                                        progress.log("", .{});
+                                        print("{s}\nThe following command incorrectly succeeded:\n", .{result.stderr});
                                        dumpArgs(build_args.items);
                                        return parseError(tokenizer, code.source_token, "example build incorrectly succeeded", .{});
                                    }
                                },
                                else => {
-                                    print("{}\nThe following command crashed:\n", .{result.stderr});
+                                    progress.log("", .{});
+                                    print("{s}\nThe following command crashed:\n", .{result.stderr});
                                    dumpArgs(build_args.items);
                                    return parseError(tokenizer, code.source_token, "example compile crashed", .{});
                                },
                            }
                            if (mem.indexOf(u8, result.stderr, error_match) == null) {
-                                print("{}\nExpected to find '{}' in stderr\n", .{ result.stderr, error_match });
+                                progress.log("", .{});
+                                print("{s}\nExpected to find '{s}' in stderr\n", .{ result.stderr, error_match });
                                return parseError(tokenizer, code.source_token, "example did not have expected compile error message", .{});
                            }
                            const escaped_stderr = try escapeHtml(allocator, result.stderr);
                            const colored_stderr = try termColor(allocator, escaped_stderr);
-                            try out.print("\n{}", .{colored_stderr});
+                            try out.print("\n{s}", .{colored_stderr});
                        } else {
                            _ = exec(allocator, &env_map, build_args.items) catch return parseError(tokenizer, code.source_token, "example failed to compile", .{});
                        }
@@ -1416,7 +1461,7 @@ fn genHtml(allocator: *mem.Allocator, tokenizer: *Tokenizer, toc: *Toc, out: any
                                tmp_dir_name, fs.path.sep_str, bin_basename,
                            }),
                        });
-                        try out.print("<pre><code class=\"shell\">$ zig build-lib {}.zig", .{code.name});
+                        try out.print("<pre><code class=\"shell\">$ zig build-lib {s}.zig", .{code.name});
                        switch (code.mode) {
                            .Debug => {},
                            else => {
@@ -1426,15 +1471,26 @@ fn genHtml(allocator: *mem.Allocator, tokenizer: *Tokenizer, toc: *Toc, out: any
                        }
                        if (code.target_str) |triple| {
                            try test_args.appendSlice(&[_][]const u8{ "-target", triple });
-                            try out.print(" -target {}", .{triple});
+                            try out.print(" -target {s}", .{triple});
+                        }
+                        if (code.link_mode) |link_mode| {
+                            switch (link_mode) {
+                                .Static => {
+                                    try test_args.append("-static");
+                                    try out.print(" -static", .{});
+                                },
+                                .Dynamic => {
+                                    try test_args.append("-dynamic");
+                                    try out.print(" -dynamic", .{});
+                                },
+                            }
                        }
                        const result = exec(allocator, &env_map, test_args.items) catch return parseError(tokenizer, code.source_token, "test failed", .{});
                        const escaped_stderr = try escapeHtml(allocator, result.stderr);
                        const escaped_stdout = try escapeHtml(allocator, result.stdout);
-                        try out.print("\n{}{}</code></pre>\n", .{ escaped_stderr, escaped_stdout });
+                        try out.print("\n{s}{s}</code></pre>\n", .{ escaped_stderr, escaped_stdout });
                    },
                }
-                print("OK\n", .{});
            },
        }
    }
@@ -1450,13 +1506,13 @@ fn exec(allocator: *mem.Allocator, env_map: *std.BufMap, args: []const []const u
    switch (result.term) {
        .Exited => |exit_code| {
            if (exit_code != 0) {
-                print("{}\nThe following command exited with code {}:\n", .{ result.stderr, exit_code });
+                print("{s}\nThe following command exited with code {}:\n", .{ result.stderr, exit_code });
                dumpArgs(args);
                return error.ChildExitError;
            }
        },
        else => {
-            print("{}\nThe following command crashed:\n", .{result.stderr});
+            print("{s}\nThe following command crashed:\n", .{result.stderr});
            dumpArgs(args);
            return error.ChildCrashed;
        },
@@ -1471,7 +1527,7 @@ fn getBuiltinCode(allocator: *mem.Allocator, env_map: *std.BufMap, zig_exe: []co

 fn dumpArgs(args: []const []const u8) void {
    for (args) |arg|
-        print("{} ", .{arg})
+        print("{s} ", .{arg})
    else
        print("\n", .{});
 }
--- a/doc/langref.html.in
+++ b/doc/langref.html.in
--- a/lib/include/__clang_cuda_builtin_vars.h
+++ b/lib/include/__clang_cuda_builtin_vars.h
@@ -55,7 +55,9 @@ struct __cuda_builtin_threadIdx_t {
  __CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_tid_z());
  // threadIdx should be convertible to uint3 (in fact in nvcc, it *is* a
  // uint3).  This function is defined after we pull in vector_types.h.
+  __attribute__((device)) operator dim3() const;
  __attribute__((device)) operator uint3() const;
+
 private:
  __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_threadIdx_t);
 };
@@ -66,7 +68,9 @@ struct __cuda_builtin_blockIdx_t {
  __CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_ctaid_z());
  // blockIdx should be convertible to uint3 (in fact in nvcc, it *is* a
  // uint3).  This function is defined after we pull in vector_types.h.
+  __attribute__((device)) operator dim3() const;
  __attribute__((device)) operator uint3() const;
+
 private:
  __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockIdx_t);
 };
@@ -78,6 +82,8 @@ struct __cuda_builtin_blockDim_t {
  // blockDim should be convertible to dim3 (in fact in nvcc, it *is* a
  // dim3).  This function is defined after we pull in vector_types.h.
  __attribute__((device)) operator dim3() const;
+  __attribute__((device)) operator uint3() const;
+
 private:
  __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockDim_t);
 };
@@ -89,6 +95,8 @@ struct __cuda_builtin_gridDim_t {
  // gridDim should be convertible to dim3 (in fact in nvcc, it *is* a
  // dim3).  This function is defined after we pull in vector_types.h.
  __attribute__((device)) operator dim3() const;
+  __attribute__((device)) operator uint3() const;
+
 private:
  __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_gridDim_t);
 };
@@ -108,5 +116,6 @@ __attribute__((device)) const int warpSize = 32;
 #undef __CUDA_DEVICE_BUILTIN
 #undef __CUDA_BUILTIN_VAR
 #undef __CUDA_DISALLOW_BUILTINVAR_ACCESS
+#undef __DELETE

 #endif /* __CUDA_BUILTIN_VARS_H */
--- a/lib/include/__clang_cuda_cmath.h
+++ b/lib/include/__clang_cuda_cmath.h
@@ -66,10 +66,38 @@ __DEVICE__ float frexp(float __arg, int *__exp) {
 }

 // For inscrutable reasons, the CUDA headers define these functions for us on
-// Windows. For OpenMP we omit these as some old system headers have
-// non-conforming `isinf(float)` and `isnan(float)` implementations that return
-// an `int`. The system versions of these functions should be fine anyway.
-#if !defined(_MSC_VER) && !defined(__OPENMP_NVPTX__)
+// Windows.
+#if !defined(_MSC_VER) || defined(__OPENMP_NVPTX__)
+
+// For OpenMP we work around some old system headers that have non-conforming
+// `isinf(float)` and `isnan(float)` implementations that return an `int`. We do
+// this by providing two versions of these functions, differing only in the
+// return type. To avoid conflicting definitions we disable implicit base
+// function generation. That means we will end up with two specializations, one
+// per type, but only one has a base function defined by the system header.
+#if defined(__OPENMP_NVPTX__)
+#pragma omp begin declare variant match(                                       \
+    implementation = {extension(disable_implicit_base)})
+
+// FIXME: We lack an extension to customize the mangling of the variants, e.g.,
+//        add a suffix. This means we would clash with the names of the variants
+//        (note that we do not create implicit base functions here). To avoid
+//        this clash we add a new trait to some of them that is always true
+//        (this is LLVM after all ;)). It will only influence the mangled name
+//        of the variants inside the inner region and avoid the clash.
+#pragma omp begin declare variant match(implementation = {vendor(llvm)})
+
+__DEVICE__ int isinf(float __x) { return ::__isinff(__x); }
+__DEVICE__ int isinf(double __x) { return ::__isinf(__x); }
+__DEVICE__ int isfinite(float __x) { return ::__finitef(__x); }
+__DEVICE__ int isfinite(double __x) { return ::__isfinited(__x); }
+__DEVICE__ int isnan(float __x) { return ::__isnanf(__x); }
+__DEVICE__ int isnan(double __x) { return ::__isnan(__x); }
+
+#pragma omp end declare variant
+
+#endif
+
 __DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
 __DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
 __DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
@@ -79,6 +107,11 @@ __DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
 __DEVICE__ bool isfinite(double __x) { return ::__isfinited(__x); }
 __DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); }
 __DEVICE__ bool isnan(double __x) { return ::__isnan(__x); }
+
+#if defined(__OPENMP_NVPTX__)
+#pragma omp end declare variant
+#endif
+
 #endif

 __DEVICE__ bool isgreater(float __x, float __y) {
@@ -142,6 +175,15 @@ __DEVICE__ float sqrt(float __x) { return ::sqrtf(__x); }
 __DEVICE__ float tan(float __x) { return ::tanf(__x); }
 __DEVICE__ float tanh(float __x) { return ::tanhf(__x); }

+// There was a redefinition error for this overload in CUDA mode.
+// We restrict it to OpenMP mode for now, that is where it is actually needed
+// anyway.
+#ifdef __OPENMP_NVPTX__
+__DEVICE__ float remquo(float __n, float __d, int *__q) {
+  return ::remquof(__n, __d, __q);
+}
+#endif
+
 // Notably missing above is nexttoward.  We omit it because
 // libdevice doesn't provide an implementation, and we don't want to be in the
 // business of implementing tricky libm functions in this header.
--- a/lib/include/__clang_cuda_complex_builtins.h
+++ b/lib/include/__clang_cuda_complex_builtins.h
@@ -16,7 +16,7 @@
 // to work with CUDA and OpenMP target offloading [in C and C++ mode].)

 #pragma push_macro("__DEVICE__")
-#ifdef _OPENMP
+#ifdef __OPENMP_NVPTX__
 #pragma omp declare target
 #define __DEVICE__ __attribute__((noinline, nothrow, cold, weak))
 #else
@@ -26,7 +26,7 @@
 // To make the algorithms available for C and C++ in CUDA and OpenMP we select
 // different but equivalent function versions. TODO: For OpenMP we currently
 // select the native builtins as the overload support for templates is lacking.
-#if !defined(_OPENMP)
+#if !defined(__OPENMP_NVPTX__)
 #define _ISNANd std::isnan
 #define _ISNANf std::isnan
 #define _ISINFd std::isinf
@@ -41,6 +41,27 @@
 #define _ABSf std::abs
 #define _LOGBd std::logb
 #define _LOGBf std::logb
+// Rather than pulling in std::max from <algorithm> every time, use the available ::max.
+#define _fmaxd max
+#define _fmaxf max
+#else
+#ifdef __AMDGCN__
+#define _ISNANd __ocml_isnan_f64
+#define _ISNANf __ocml_isnan_f32
+#define _ISINFd __ocml_isinf_f64
+#define _ISINFf __ocml_isinf_f32
+#define _ISFINITEd __ocml_isfinite_f64
+#define _ISFINITEf __ocml_isfinite_f32
+#define _COPYSIGNd __ocml_copysign_f64
+#define _COPYSIGNf __ocml_copysign_f32
+#define _SCALBNd __ocml_scalbn_f64
+#define _SCALBNf __ocml_scalbn_f32
+#define _ABSd __ocml_fabs_f64
+#define _ABSf __ocml_fabs_f32
+#define _LOGBd __ocml_logb_f64
+#define _LOGBf __ocml_logb_f32
+#define _fmaxd __ocml_fmax_f64
+#define _fmaxf __ocml_fmax_f32
 #else
 #define _ISNANd __nv_isnand
 #define _ISNANf __nv_isnanf
@@ -56,6 +77,9 @@
 #define _ABSf __nv_fabsf
 #define _LOGBd __nv_logb
 #define _LOGBf __nv_logbf
+#define _fmaxd __nv_fmax
+#define _fmaxf __nv_fmaxf
+#endif
 #endif

 #if defined(__cplusplus)
@@ -167,7 +191,7 @@ __DEVICE__ double _Complex __divdc3(double __a, double __b, double __c,
  // Can't use std::max, because that's defined in <algorithm>, and we don't
  // want to pull that in for every compile.  The CUDA headers define
  // ::max(float, float) and ::max(double, double), which is sufficient for us.
-  double __logbw = _LOGBd(max(_ABSd(__c), _ABSd(__d)));
+  double __logbw = _LOGBd(_fmaxd(_ABSd(__c), _ABSd(__d)));
  if (_ISFINITEd(__logbw)) {
    __ilogbw = (int)__logbw;
    __c = _SCALBNd(__c, -__ilogbw);
@@ -200,7 +224,7 @@ __DEVICE__ double _Complex __divdc3(double __a, double __b, double __c,

 __DEVICE__ float _Complex __divsc3(float __a, float __b, float __c, float __d) {
  int __ilogbw = 0;
-  float __logbw = _LOGBf(max(_ABSf(__c), _ABSf(__d)));
+  float __logbw = _LOGBf(_fmaxf(_ABSf(__c), _ABSf(__d)));
  if (_ISFINITEf(__logbw)) {
    __ilogbw = (int)__logbw;
    __c = _SCALBNf(__c, -__ilogbw);
@@ -249,8 +273,10 @@ __DEVICE__ float _Complex __divsc3(float __a, float __b, float __c, float __d) {
 #undef _ABSf
 #undef _LOGBd
 #undef _LOGBf
+#undef _fmaxd
+#undef _fmaxf

-#ifdef _OPENMP
+#ifdef __OPENMP_NVPTX__
 #pragma omp end declare target
 #endif

--- a/lib/include/__clang_cuda_math.h
+++ b/lib/include/__clang_cuda_math.h
@@ -195,8 +195,8 @@ __DEVICE__ int max(int __a, int __b) { return __nv_max(__a, __b); }
 __DEVICE__ int min(int __a, int __b) { return __nv_min(__a, __b); }
 __DEVICE__ double modf(double __a, double *__b) { return __nv_modf(__a, __b); }
 __DEVICE__ float modff(float __a, float *__b) { return __nv_modff(__a, __b); }
-__DEVICE__ double nearbyint(double __a) { return __nv_nearbyint(__a); }
-__DEVICE__ float nearbyintf(float __a) { return __nv_nearbyintf(__a); }
+__DEVICE__ double nearbyint(double __a) { return __builtin_nearbyint(__a); }
+__DEVICE__ float nearbyintf(float __a) { return __builtin_nearbyintf(__a); }
 __DEVICE__ double nextafter(double __a, double __b) {
  return __nv_nextafter(__a, __b);
 }
@@ -249,8 +249,9 @@ __DEVICE__ double rhypot(double __a, double __b) {
 __DEVICE__ float rhypotf(float __a, float __b) {
  return __nv_rhypotf(__a, __b);
 }
-__DEVICE__ double rint(double __a) { return __nv_rint(__a); }
-__DEVICE__ float rintf(float __a) { return __nv_rintf(__a); }
+// __nv_rint* in libdevice is buggy and produces incorrect results.
+__DEVICE__ double rint(double __a) { return __builtin_rint(__a); }
+__DEVICE__ float rintf(float __a) { return __builtin_rintf(__a); }
 __DEVICE__ double rnorm(int __a, const double *__b) {
  return __nv_rnorm(__a, __b);
 }
--- a/lib/include/__clang_cuda_math_forward_declares.h
+++ b/lib/include/__clang_cuda_math_forward_declares.h
@@ -160,6 +160,9 @@ __DEVICE__ double scalbln(double, long);
 __DEVICE__ float scalbln(float, long);
 __DEVICE__ double scalbn(double, int);
 __DEVICE__ float scalbn(float, int);
+#ifdef _MSC_VER
+__DEVICE__ bool signbit(long double);
+#endif
 __DEVICE__ bool signbit(double);
 __DEVICE__ bool signbit(float);
 __DEVICE__ double sin(double);
--- a/lib/include/__clang_cuda_runtime_wrapper.h
+++ b/lib/include/__clang_cuda_runtime_wrapper.h
@@ -377,30 +377,38 @@ __device__ static inline void *malloc(size_t __size) {
 // Out-of-line implementations from __clang_cuda_builtin_vars.h.  These need to
 // come after we've pulled in the definition of uint3 and dim3.

+__device__ inline __cuda_builtin_threadIdx_t::operator dim3() const {
+  return dim3(x, y, z);
+}
+
 __device__ inline __cuda_builtin_threadIdx_t::operator uint3() const {
-  uint3 ret;
-  ret.x = x;
-  ret.y = y;
-  ret.z = z;
-  return ret;
+  return {x, y, z};
+}
+
+__device__ inline __cuda_builtin_blockIdx_t::operator dim3() const {
+  return dim3(x, y, z);
 }

 __device__ inline __cuda_builtin_blockIdx_t::operator uint3() const {
-  uint3 ret;
-  ret.x = x;
-  ret.y = y;
-  ret.z = z;
-  return ret;
+  return {x, y, z};
 }

 __device__ inline __cuda_builtin_blockDim_t::operator dim3() const {
  return dim3(x, y, z);
 }

+__device__ inline __cuda_builtin_blockDim_t::operator uint3() const {
+  return {x, y, z};
+}
+
 __device__ inline __cuda_builtin_gridDim_t::operator dim3() const {
  return dim3(x, y, z);
 }

+__device__ inline __cuda_builtin_gridDim_t::operator uint3() const {
+  return {x, y, z};
+}
+
 #include <__clang_cuda_cmath.h>
 #include <__clang_cuda_intrinsics.h>
 #include <__clang_cuda_complex_builtins.h>
--- a/lib/include/__clang_hip_cmath.h
+++ b/lib/include/__clang_hip_cmath.h
@@ -0,0 +1,664 @@
+/*===---- __clang_hip_cmath.h - HIP cmath decls -----------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_HIP_CMATH_H__
+#define __CLANG_HIP_CMATH_H__
+
+#if !defined(__HIP__)
+#error "This file is for HIP and OpenMP AMDGCN device compilation only."
+#endif
+
+#if defined(__cplusplus)
+#include <limits>
+#include <type_traits>
+#include <utility>
+#endif
+#include <limits.h>
+#include <stdint.h>
+
+#pragma push_macro("__DEVICE__")
+#define __DEVICE__ static __device__ inline __attribute__((always_inline))
+
+// Start with functions that cannot be defined by DEF macros below.
+#if defined(__cplusplus)
+__DEVICE__ double abs(double __x) { return ::fabs(__x); }
+__DEVICE__ float abs(float __x) { return ::fabsf(__x); }
+__DEVICE__ long long abs(long long __n) { return ::llabs(__n); }
+__DEVICE__ long abs(long __n) { return ::labs(__n); }
+__DEVICE__ float fma(float __x, float __y, float __z) {
+  return ::fmaf(__x, __y, __z);
+}
+__DEVICE__ int fpclassify(float __x) {
+  return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
+                              FP_ZERO, __x);
+}
+__DEVICE__ int fpclassify(double __x) {
+  return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
+                              FP_ZERO, __x);
+}
+__DEVICE__ float frexp(float __arg, int *__exp) {
+  return ::frexpf(__arg, __exp);
+}
+__DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
+__DEVICE__ bool isfinite(double __x) { return ::__finite(__x); }
+__DEVICE__ bool isgreater(float __x, float __y) {
+  return __builtin_isgreater(__x, __y);
+}
+__DEVICE__ bool isgreater(double __x, double __y) {
+  return __builtin_isgreater(__x, __y);
+}
+__DEVICE__ bool isgreaterequal(float __x, float __y) {
+  return __builtin_isgreaterequal(__x, __y);
+}
+__DEVICE__ bool isgreaterequal(double __x, double __y) {
+  return __builtin_isgreaterequal(__x, __y);
+}
+__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
+__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
+__DEVICE__ bool isless(float __x, float __y) {
+  return __builtin_isless(__x, __y);
+}
+__DEVICE__ bool isless(double __x, double __y) {
+  return __builtin_isless(__x, __y);
+}
+__DEVICE__ bool islessequal(float __x, float __y) {
+  return __builtin_islessequal(__x, __y);
+}
+__DEVICE__ bool islessequal(double __x, double __y) {
+  return __builtin_islessequal(__x, __y);
+}
+__DEVICE__ bool islessgreater(float __x, float __y) {
+  return __builtin_islessgreater(__x, __y);
+}
+__DEVICE__ bool islessgreater(double __x, double __y) {
+  return __builtin_islessgreater(__x, __y);
+}
+__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); }
+__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); }
+__DEVICE__ bool isnormal(float __x) { return __builtin_isnormal(__x); }
+__DEVICE__ bool isnormal(double __x) { return __builtin_isnormal(__x); }
+__DEVICE__ bool isunordered(float __x, float __y) {
+  return __builtin_isunordered(__x, __y);
+}
+__DEVICE__ bool isunordered(double __x, double __y) {
+  return __builtin_isunordered(__x, __y);
+}
+__DEVICE__ float modf(float __x, float *__iptr) { return ::modff(__x, __iptr); }
+__DEVICE__ float pow(float __base, int __iexp) {
+  return ::powif(__base, __iexp);
+}
+__DEVICE__ double pow(double __base, int __iexp) {
+  return ::powi(__base, __iexp);
+}
+__DEVICE__ float remquo(float __x, float __y, int *__quo) {
+  return ::remquof(__x, __y, __quo);
+}
+__DEVICE__ float scalbln(float __x, long int __n) {
+  return ::scalblnf(__x, __n);
+}
+__DEVICE__ bool signbit(float __x) { return ::__signbitf(__x); }
+__DEVICE__ bool signbit(double __x) { return ::__signbit(__x); }
+
+// Notably missing above is nexttoward.  We omit it because
+// ocml doesn't provide an implementation, and we don't want to be in the
+// business of implementing tricky libm functions in this header.
+
+// Other functions.
+__DEVICE__ _Float16 fma(_Float16 __x, _Float16 __y, _Float16 __z) {
+  return __ocml_fma_f16(__x, __y, __z);
+}
+__DEVICE__ _Float16 pow(_Float16 __base, int __iexp) {
+  return __ocml_pown_f16(__base, __iexp);
+}
+
+// BEGIN DEF_FUN and HIP_OVERLOAD
+
+// BEGIN DEF_FUN
+
+#pragma push_macro("__DEF_FUN1")
+#pragma push_macro("__DEF_FUN2")
+#pragma push_macro("__DEF_FUN2_FI")
+
+// Define cmath functions with float argument and returns __retty.
+#define __DEF_FUN1(__retty, __func)                                            \
+  __DEVICE__                                                                   \
+  __retty __func(float __x) { return __func##f(__x); }
+
+// Define cmath functions with two float arguments and returns __retty.
+#define __DEF_FUN2(__retty, __func)                                            \
+  __DEVICE__                                                                   \
+  __retty __func(float __x, float __y) { return __func##f(__x, __y); }
+
+// Define cmath functions with a float and an int argument and returns __retty.
+#define __DEF_FUN2_FI(__retty, __func)                                         \
+  __DEVICE__                                                                   \
+  __retty __func(float __x, int __y) { return __func##f(__x, __y); }
+
+__DEF_FUN1(float, acos)
+__DEF_FUN1(float, acosh)
+__DEF_FUN1(float, asin)
+__DEF_FUN1(float, asinh)
+__DEF_FUN1(float, atan)
+__DEF_FUN2(float, atan2)
+__DEF_FUN1(float, atanh)
+__DEF_FUN1(float, cbrt)
+__DEF_FUN1(float, ceil)
+__DEF_FUN2(float, copysign)
+__DEF_FUN1(float, cos)
+__DEF_FUN1(float, cosh)
+__DEF_FUN1(float, erf)
+__DEF_FUN1(float, erfc)
+__DEF_FUN1(float, exp)
+__DEF_FUN1(float, exp2)
+__DEF_FUN1(float, expm1)
+__DEF_FUN1(float, fabs)
+__DEF_FUN2(float, fdim)
+__DEF_FUN1(float, floor)
+__DEF_FUN2(float, fmax)
+__DEF_FUN2(float, fmin)
+__DEF_FUN2(float, fmod)
+__DEF_FUN2(float, hypot)
+__DEF_FUN1(int, ilogb)
+__DEF_FUN2_FI(float, ldexp)
+__DEF_FUN1(float, lgamma)
+__DEF_FUN1(float, log)
+__DEF_FUN1(float, log10)
+__DEF_FUN1(float, log1p)
+__DEF_FUN1(float, log2)
+__DEF_FUN1(float, logb)
+__DEF_FUN1(long long, llrint)
+__DEF_FUN1(long long, llround)
+__DEF_FUN1(long, lrint)
+__DEF_FUN1(long, lround)
+__DEF_FUN1(float, nearbyint)
+__DEF_FUN2(float, nextafter)
+__DEF_FUN2(float, pow)
+__DEF_FUN2(float, remainder)
+__DEF_FUN1(float, rint)
+__DEF_FUN1(float, round)
+__DEF_FUN2_FI(float, scalbn)
+__DEF_FUN1(float, sin)
+__DEF_FUN1(float, sinh)
+__DEF_FUN1(float, sqrt)
+__DEF_FUN1(float, tan)
+__DEF_FUN1(float, tanh)
+__DEF_FUN1(float, tgamma)
+__DEF_FUN1(float, trunc)
+
+#pragma pop_macro("__DEF_FUN1")
+#pragma pop_macro("__DEF_FUN2")
+#pragma pop_macro("__DEF_FUN2_FI")
+
+// END DEF_FUN
+
+// BEGIN HIP_OVERLOAD
+
+#pragma push_macro("__HIP_OVERLOAD1")
+#pragma push_macro("__HIP_OVERLOAD2")
+
+// __hip_enable_if::type is a type function which returns __T if __B is true.
+template <bool __B, class __T = void> struct __hip_enable_if {};
+
+template <class __T> struct __hip_enable_if<true, __T> { typedef __T type; };
+
+// decltype is only available in C++11 and above.
+#if __cplusplus >= 201103L
+// __hip_promote
+namespace __hip {
+
+template <class _Tp> struct __numeric_type {
+  static void __test(...);
+  static _Float16 __test(_Float16);
+  static float __test(float);
+  static double __test(char);
+  static double __test(int);
+  static double __test(unsigned);
+  static double __test(long);
+  static double __test(unsigned long);
+  static double __test(long long);
+  static double __test(unsigned long long);
+  static double __test(double);
+  // No support for long double, use double instead.
+  static double __test(long double);
+
+  typedef decltype(__test(std::declval<_Tp>())) type;
+  static const bool value = !std::is_same<type, void>::value;
+};
+
+template <> struct __numeric_type<void> { static const bool value = true; };
+
+template <class _A1, class _A2 = void, class _A3 = void,
+          bool = __numeric_type<_A1>::value &&__numeric_type<_A2>::value
+              &&__numeric_type<_A3>::value>
+class __promote_imp {
+public:
+  static const bool value = false;
+};
+
+template <class _A1, class _A2, class _A3>
+class __promote_imp<_A1, _A2, _A3, true> {
+private:
+  typedef typename __promote_imp<_A1>::type __type1;
+  typedef typename __promote_imp<_A2>::type __type2;
+  typedef typename __promote_imp<_A3>::type __type3;
+
+public:
+  typedef decltype(__type1() + __type2() + __type3()) type;
+  static const bool value = true;
+};
+
+template <class _A1, class _A2> class __promote_imp<_A1, _A2, void, true> {
+private:
+  typedef typename __promote_imp<_A1>::type __type1;
+  typedef typename __promote_imp<_A2>::type __type2;
+
+public:
+  typedef decltype(__type1() + __type2()) type;
+  static const bool value = true;
+};
+
+template <class _A1> class __promote_imp<_A1, void, void, true> {
+public:
+  typedef typename __numeric_type<_A1>::type type;
+  static const bool value = true;
+};
+
+template <class _A1, class _A2 = void, class _A3 = void>
+class __promote : public __promote_imp<_A1, _A2, _A3> {};
+
+} // namespace __hip
+#endif //__cplusplus >= 201103L
+
+// __HIP_OVERLOAD1 is used to resolve function calls with integer argument to
+// avoid compilation error due to ambiguity. e.g. floor(5) is resolved with
+// floor(double).
+#define __HIP_OVERLOAD1(__retty, __fn)                                         \
+  template <typename __T>                                                      \
+  __DEVICE__ typename __hip_enable_if<std::numeric_limits<__T>::is_integer,    \
+                                      __retty>::type                           \
+  __fn(__T __x) {                                                              \
+    return ::__fn((double)__x);                                                \
+  }
+
+// __HIP_OVERLOAD2 resolves calls with mixed float/double or integer arguments
+// to avoid ambiguity errors: arguments are promoted to a common type and the
+// result is __retty, e.g. max(5.0f, 6.0) is resolved with max(double, double).
+#if __cplusplus >= 201103L
+#define __HIP_OVERLOAD2(__retty, __fn)                                         \
+  template <typename __T1, typename __T2>                                      \
+  __DEVICE__ typename __hip_enable_if<                                         \
+      std::numeric_limits<__T1>::is_specialized &&                             \
+          std::numeric_limits<__T2>::is_specialized,                           \
+      __retty>::type                                                           \
+  __fn(__T1 __x, __T2 __y) {                                                   \
+    typedef typename __hip::__promote<__T1, __T2>::type __arg_type;            \
+    return __fn((__arg_type)__x, (__arg_type)__y);                             \
+  }
+#else
+#define __HIP_OVERLOAD2(__retty, __fn)                                         \
+  template <typename __T1, typename __T2>                                      \
+  __DEVICE__                                                                   \
+      typename __hip_enable_if<std::numeric_limits<__T1>::is_specialized &&    \
+                                   std::numeric_limits<__T2>::is_specialized,  \
+                               __retty>::type                                  \
+      __fn(__T1 __x, __T2 __y) {                                               \
+    return __fn((double)__x, (double)__y);                                     \
+  }
+#endif
+
+__HIP_OVERLOAD1(double, abs)
+__HIP_OVERLOAD1(double, acos)
+__HIP_OVERLOAD1(double, acosh)
+__HIP_OVERLOAD1(double, asin)
+__HIP_OVERLOAD1(double, asinh)
+__HIP_OVERLOAD1(double, atan)
+__HIP_OVERLOAD2(double, atan2)
+__HIP_OVERLOAD1(double, atanh)
+__HIP_OVERLOAD1(double, cbrt)
+__HIP_OVERLOAD1(double, ceil)
+__HIP_OVERLOAD2(double, copysign)
+__HIP_OVERLOAD1(double, cos)
+__HIP_OVERLOAD1(double, cosh)
+__HIP_OVERLOAD1(double, erf)
+__HIP_OVERLOAD1(double, erfc)
+__HIP_OVERLOAD1(double, exp)
+__HIP_OVERLOAD1(double, exp2)
+__HIP_OVERLOAD1(double, expm1)
+__HIP_OVERLOAD1(double, fabs)
+__HIP_OVERLOAD2(double, fdim)
+__HIP_OVERLOAD1(double, floor)
+__HIP_OVERLOAD2(double, fmax)
+__HIP_OVERLOAD2(double, fmin)
+__HIP_OVERLOAD2(double, fmod)
+__HIP_OVERLOAD1(int, fpclassify)
+__HIP_OVERLOAD2(double, hypot)
+__HIP_OVERLOAD1(int, ilogb)
+__HIP_OVERLOAD1(bool, isfinite)
+__HIP_OVERLOAD2(bool, isgreater)
+__HIP_OVERLOAD2(bool, isgreaterequal)
+__HIP_OVERLOAD1(bool, isinf)
+__HIP_OVERLOAD2(bool, isless)
+__HIP_OVERLOAD2(bool, islessequal)
+__HIP_OVERLOAD2(bool, islessgreater)
+__HIP_OVERLOAD1(bool, isnan)
+__HIP_OVERLOAD1(bool, isnormal)
+__HIP_OVERLOAD2(bool, isunordered)
+__HIP_OVERLOAD1(double, lgamma)
+__HIP_OVERLOAD1(double, log)
+__HIP_OVERLOAD1(double, log10)
+__HIP_OVERLOAD1(double, log1p)
+__HIP_OVERLOAD1(double, log2)
+__HIP_OVERLOAD1(double, logb)
+__HIP_OVERLOAD1(long long, llrint)
+__HIP_OVERLOAD1(long long, llround)
+__HIP_OVERLOAD1(long, lrint)
+__HIP_OVERLOAD1(long, lround)
+__HIP_OVERLOAD1(double, nearbyint)
+__HIP_OVERLOAD2(double, nextafter)
+__HIP_OVERLOAD2(double, pow)
+__HIP_OVERLOAD2(double, remainder)
+__HIP_OVERLOAD1(double, rint)
+__HIP_OVERLOAD1(double, round)
+__HIP_OVERLOAD1(bool, signbit)
+__HIP_OVERLOAD1(double, sin)
+__HIP_OVERLOAD1(double, sinh)
+__HIP_OVERLOAD1(double, sqrt)
+__HIP_OVERLOAD1(double, tan)
+__HIP_OVERLOAD1(double, tanh)
+__HIP_OVERLOAD1(double, tgamma)
+__HIP_OVERLOAD1(double, trunc)
+
+// Overload these but don't add them to std, they are not part of cmath.
+__HIP_OVERLOAD2(double, max)
+__HIP_OVERLOAD2(double, min)
+
+// Additional Overloads that don't quite match HIP_OVERLOAD.
+#if __cplusplus >= 201103L
+template <typename __T1, typename __T2, typename __T3>
+__DEVICE__ typename __hip_enable_if<
+    std::numeric_limits<__T1>::is_specialized &&
+        std::numeric_limits<__T2>::is_specialized &&
+        std::numeric_limits<__T3>::is_specialized,
+    typename __hip::__promote<__T1, __T2, __T3>::type>::type
+fma(__T1 __x, __T2 __y, __T3 __z) {
+  typedef typename __hip::__promote<__T1, __T2, __T3>::type __result_type;
+  return ::fma((__result_type)__x, (__result_type)__y, (__result_type)__z);
+}
+#else
+template <typename __T1, typename __T2, typename __T3>
+__DEVICE__
+    typename __hip_enable_if<std::numeric_limits<__T1>::is_specialized &&
+                                 std::numeric_limits<__T2>::is_specialized &&
+                                 std::numeric_limits<__T3>::is_specialized,
+                             double>::type
+    fma(__T1 __x, __T2 __y, __T3 __z) {
+  return ::fma((double)__x, (double)__y, (double)__z);
+}
+#endif
+
+template <typename __T>
+__DEVICE__
+    typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
+    frexp(__T __x, int *__exp) {
+  return ::frexp((double)__x, __exp);
+}
+
+template <typename __T>
+__DEVICE__
+    typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
+    ldexp(__T __x, int __exp) {
+  return ::ldexp((double)__x, __exp);
+}
+
+template <typename __T>
+__DEVICE__
+    typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
+    modf(__T __x, double *__exp) {
+  return ::modf((double)__x, __exp);
+}
+
+#if __cplusplus >= 201103L
+template <typename __T1, typename __T2>
+__DEVICE__
+    typename __hip_enable_if<std::numeric_limits<__T1>::is_specialized &&
+                                 std::numeric_limits<__T2>::is_specialized,
+                             typename __hip::__promote<__T1, __T2>::type>::type
+    remquo(__T1 __x, __T2 __y, int *__quo) {
+  typedef typename __hip::__promote<__T1, __T2>::type __result_type;
+  return ::remquo((__result_type)__x, (__result_type)__y, __quo);
+}
+#else
+template <typename __T1, typename __T2>
+__DEVICE__
+    typename __hip_enable_if<std::numeric_limits<__T1>::is_specialized &&
+                                 std::numeric_limits<__T2>::is_specialized,
+                             double>::type
+    remquo(__T1 __x, __T2 __y, int *__quo) {
+  return ::remquo((double)__x, (double)__y, __quo);
+}
+#endif
+
+template <typename __T>
+__DEVICE__
+    typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
+    scalbln(__T __x, long int __exp) {
+  return ::scalbln((double)__x, __exp);
+}
+
+template <typename __T>
+__DEVICE__
+    typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
+    scalbn(__T __x, int __exp) {
+  return ::scalbn((double)__x, __exp);
+}
+
+#pragma pop_macro("__HIP_OVERLOAD1")
+#pragma pop_macro("__HIP_OVERLOAD2")
+
+// END HIP_OVERLOAD
+
+// END DEF_FUN and HIP_OVERLOAD
+
+#endif // defined(__cplusplus)
+
+// Define these overloads inside the namespace our standard library uses.
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
+namespace std {
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+#endif
+#endif
+
+// Pull the new overloads we defined above into namespace std.
+// using ::abs; - This may be considered for C++.
+using ::acos;
+using ::acosh;
+using ::asin;
+using ::asinh;
+using ::atan;
+using ::atan2;
+using ::atanh;
+using ::cbrt;
+using ::ceil;
+using ::copysign;
+using ::cos;
+using ::cosh;
+using ::erf;
+using ::erfc;
+using ::exp;
+using ::exp2;
+using ::expm1;
+using ::fabs;
+using ::fdim;
+using ::floor;
+using ::fma;
+using ::fmax;
+using ::fmin;
+using ::fmod;
+using ::fpclassify;
+using ::frexp;
+using ::hypot;
+using ::ilogb;
+using ::isfinite;
+using ::isgreater;
+using ::isgreaterequal;
+using ::isless;
+using ::islessequal;
+using ::islessgreater;
+using ::isnormal;
+using ::isunordered;
+using ::ldexp;
+using ::lgamma;
+using ::llrint;
+using ::llround;
+using ::log;
+using ::log10;
+using ::log1p;
+using ::log2;
+using ::logb;
+using ::lrint;
+using ::lround;
+using ::modf;
+// using ::nan; - This may be considered for C++.
+// using ::nanf; - This may be considered for C++.
+// using ::nanl; - This is not yet defined.
+using ::nearbyint;
+using ::nextafter;
+// using ::nexttoward; - Omit this since we do not have a definition.
+using ::pow;
+using ::remainder;
+using ::remquo;
+using ::rint;
+using ::round;
+using ::scalbln;
+using ::scalbn;
+using ::signbit;
+using ::sin;
+using ::sinh;
+using ::sqrt;
+using ::tan;
+using ::tanh;
+using ::tgamma;
+using ::trunc;
+
+// Well this is fun: We need to pull these symbols in for libc++, but we can't
+// pull them in with libstdc++, because its ::isinf and ::isnan are different
+// than its std::isinf and std::isnan.
+#ifndef __GLIBCXX__
+using ::isinf;
+using ::isnan;
+#endif
+
+// Finally, pull the "foobarf" functions that HIP defines into std.
+using ::acosf;
+using ::acoshf;
+using ::asinf;
+using ::asinhf;
+using ::atan2f;
+using ::atanf;
+using ::atanhf;
+using ::cbrtf;
+using ::ceilf;
+using ::copysignf;
+using ::cosf;
+using ::coshf;
+using ::erfcf;
+using ::erff;
+using ::exp2f;
+using ::expf;
+using ::expm1f;
+using ::fabsf;
+using ::fdimf;
+using ::floorf;
+using ::fmaf;
+using ::fmaxf;
+using ::fminf;
+using ::fmodf;
+using ::frexpf;
+using ::hypotf;
+using ::ilogbf;
+using ::ldexpf;
+using ::lgammaf;
+using ::llrintf;
+using ::llroundf;
+using ::log10f;
+using ::log1pf;
+using ::log2f;
+using ::logbf;
+using ::logf;
+using ::lrintf;
+using ::lroundf;
+using ::modff;
+using ::nearbyintf;
+using ::nextafterf;
+// using ::nexttowardf; - Omit this since we do not have a definition.
+using ::powf;
+using ::remainderf;
+using ::remquof;
+using ::rintf;
+using ::roundf;
+using ::scalblnf;
+using ::scalbnf;
+using ::sinf;
+using ::sinhf;
+using ::sqrtf;
+using ::tanf;
+using ::tanhf;
+using ::tgammaf;
+using ::truncf;
+
+#ifdef _LIBCPP_END_NAMESPACE_STD
+_LIBCPP_END_NAMESPACE_STD
+#else
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_END_NAMESPACE_VERSION
+#endif
+} // namespace std
+#endif
+
+// Define device-side math functions from <ymath.h> on MSVC.
+#if defined(_MSC_VER)
+
+// Before VS2019, `<ymath.h>` is also included in `<limits>` and other headers.
+// But, from VS2019, it's only included in `<complex>`. Need to include
+// `<ymath.h>` here to ensure C functions declared there won't be marked as
+// `__host__` and `__device__` through `<complex>` wrapper.
+#include <ymath.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif // defined(__cplusplus)
+__DEVICE__ __attribute__((overloadable)) double _Cosh(double x, double y) {
+  return cosh(x) * y;
+}
+__DEVICE__ __attribute__((overloadable)) float _FCosh(float x, float y) {
+  return coshf(x) * y;
+}
+__DEVICE__ __attribute__((overloadable)) short _Dtest(double *p) {
+  return fpclassify(*p);
+}
+__DEVICE__ __attribute__((overloadable)) short _FDtest(float *p) {
+  return fpclassify(*p);
+}
+__DEVICE__ __attribute__((overloadable)) double _Sinh(double x, double y) {
+  return sinh(x) * y;
+}
+__DEVICE__ __attribute__((overloadable)) float _FSinh(float x, float y) {
+  return sinhf(x) * y;
+}
+#if defined(__cplusplus)
+}
+#endif // defined(__cplusplus)
+#endif // defined(_MSC_VER)
+
+#pragma pop_macro("__DEVICE__")
+
+#endif // __CLANG_HIP_CMATH_H__
--- a/lib/include/__clang_hip_libdevice_declares.h
+++ b/lib/include/__clang_hip_libdevice_declares.h
@@ -10,7 +10,9 @@
 #ifndef __CLANG_HIP_LIBDEVICE_DECLARES_H__
 #define __CLANG_HIP_LIBDEVICE_DECLARES_H__

+#ifdef __cplusplus
 extern "C" {
+#endif

 // BEGIN FLOAT
 __device__ __attribute__((const)) float __ocml_acos_f32(float);
@@ -78,6 +80,7 @@ __device__ __attribute__((const)) float __ocml_len4_f32(float, float, float,
 __device__ __attribute__((pure)) float __ocml_ncdf_f32(float);
 __device__ __attribute__((pure)) float __ocml_ncdfinv_f32(float);
 __device__ __attribute__((pure)) float __ocml_pow_f32(float, float);
+__device__ __attribute__((pure)) float __ocml_pown_f32(float, int);
 __device__ __attribute__((pure)) float __ocml_rcbrt_f32(float);
 __device__ __attribute__((const)) float __ocml_remainder_f32(float, float);
 __device__ float __ocml_remquo_f32(float, float,
@@ -126,10 +129,10 @@ __device__ __attribute__((const)) float __ocml_div_rte_f32(float, float);
 __device__ __attribute__((const)) float __ocml_div_rtn_f32(float, float);
 __device__ __attribute__((const)) float __ocml_div_rtp_f32(float, float);
 __device__ __attribute__((const)) float __ocml_div_rtz_f32(float, float);
-__device__ __attribute__((const)) float __ocml_sqrt_rte_f32(float, float);
-__device__ __attribute__((const)) float __ocml_sqrt_rtn_f32(float, float);
-__device__ __attribute__((const)) float __ocml_sqrt_rtp_f32(float, float);
-__device__ __attribute__((const)) float __ocml_sqrt_rtz_f32(float, float);
+__device__ __attribute__((const)) float __ocml_sqrt_rte_f32(float);
+__device__ __attribute__((const)) float __ocml_sqrt_rtn_f32(float);
+__device__ __attribute__((const)) float __ocml_sqrt_rtp_f32(float);
+__device__ __attribute__((const)) float __ocml_sqrt_rtz_f32(float);
 __device__ __attribute__((const)) float __ocml_fma_rte_f32(float, float, float);
 __device__ __attribute__((const)) float __ocml_fma_rtn_f32(float, float, float);
 __device__ __attribute__((const)) float __ocml_fma_rtp_f32(float, float, float);
@@ -205,6 +208,7 @@ __device__ __attribute__((const)) double __ocml_len4_f64(double, double, double,
 __device__ __attribute__((pure)) double __ocml_ncdf_f64(double);
 __device__ __attribute__((pure)) double __ocml_ncdfinv_f64(double);
 __device__ __attribute__((pure)) double __ocml_pow_f64(double, double);
+__device__ __attribute__((pure)) double __ocml_pown_f64(double, int);
 __device__ __attribute__((pure)) double __ocml_rcbrt_f64(double);
 __device__ __attribute__((const)) double __ocml_remainder_f64(double, double);
 __device__ double __ocml_remquo_f64(double, double,
@@ -252,10 +256,10 @@ __device__ __attribute__((const)) double __ocml_div_rte_f64(double, double);
 __device__ __attribute__((const)) double __ocml_div_rtn_f64(double, double);
 __device__ __attribute__((const)) double __ocml_div_rtp_f64(double, double);
 __device__ __attribute__((const)) double __ocml_div_rtz_f64(double, double);
-__device__ __attribute__((const)) double __ocml_sqrt_rte_f64(double, double);
-__device__ __attribute__((const)) double __ocml_sqrt_rtn_f64(double, double);
-__device__ __attribute__((const)) double __ocml_sqrt_rtp_f64(double, double);
-__device__ __attribute__((const)) double __ocml_sqrt_rtz_f64(double, double);
+__device__ __attribute__((const)) double __ocml_sqrt_rte_f64(double);
+__device__ __attribute__((const)) double __ocml_sqrt_rtn_f64(double);
+__device__ __attribute__((const)) double __ocml_sqrt_rtp_f64(double);
+__device__ __attribute__((const)) double __ocml_sqrt_rtz_f64(double);
 __device__ __attribute__((const)) double __ocml_fma_rte_f64(double, double,
                                                            double);
 __device__ __attribute__((const)) double __ocml_fma_rtn_f64(double, double,
@@ -290,6 +294,7 @@ __device__ __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16);
 __device__ _Float16 __ocml_sin_f16(_Float16);
 __device__ __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16);
 __device__ __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16);
+__device__ __attribute__((pure)) _Float16 __ocml_pown_f16(_Float16, int);

 typedef _Float16 __2f16 __attribute__((ext_vector_type(2)));
 typedef short __2i16 __attribute__((ext_vector_type(2)));
@@ -313,14 +318,17 @@ __device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16);
 __device__ inline __2f16
 __llvm_amdgcn_rcp_2f16(__2f16 __x) // Not currently exposed by ROCDL.
 {
-  return __2f16{__llvm_amdgcn_rcp_f16(__x.x), __llvm_amdgcn_rcp_f16(__x.y)};
+  return (__2f16)(__llvm_amdgcn_rcp_f16(__x.x), __llvm_amdgcn_rcp_f16(__x.y));
 }
 __device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16);
 __device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16);
 __device__ __2f16 __ocml_sin_2f16(__2f16);
 __device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16);
 __device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16);
+__device__ __attribute__((const)) __2f16 __ocml_pown_2f16(__2f16, __2i16);

+#ifdef __cplusplus
 } // extern "C"
+#endif

 #endif // __CLANG_HIP_LIBDEVICE_DECLARES_H__
--- a/lib/include/__clang_hip_math.h
+++ b/lib/include/__clang_hip_math.h
--- a/lib/include/__clang_hip_runtime_wrapper.h
+++ b/lib/include/__clang_hip_runtime_wrapper.h
@@ -28,6 +28,10 @@
 #define __shared__ __attribute__((shared))
 #define __constant__ __attribute__((constant))

+#if !defined(__cplusplus) || __cplusplus < 201103L
+  #define nullptr NULL
+#endif // no nullptr keyword before C++11 / in C; the macro must not end in ';'
+
 #if __HIP_ENABLE_DEVICE_MALLOC__
 extern "C" __device__ void *__hip_malloc(size_t __size);
 extern "C" __device__ void *__hip_free(void *__ptr);
@@ -51,6 +55,7 @@ static inline __device__ void *free(void *__ptr) {

 #if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
 #include <__clang_cuda_math_forward_declares.h>
+#include <__clang_hip_cmath.h>
 #include <__clang_cuda_complex_builtins.h>

 #include <algorithm>
--- a/lib/include/altivec.h
+++ b/lib/include/altivec.h
--- a/lib/include/amxintrin.h
+++ b/lib/include/amxintrin.h
@@ -15,8 +15,8 @@
 #define __AMXINTRIN_H
 #ifdef __x86_64__

-#define __DEFAULT_FN_ATTRS \
-  __attribute__((__always_inline__, __nodebug__,  __target__("amx-tile")))
+#define __DEFAULT_FN_ATTRS_TILE                                                \
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))

 /// Load tile configuration from a 64-byte memory location specified by
 /// "mem_addr". The tile configuration includes the tile type palette, the
@@ -31,9 +31,8 @@
 ///
 /// \param __config
 ///    A pointer to 512-bits configuration
-static __inline__ void __DEFAULT_FN_ATTRS
-_tile_loadconfig(const void *__config)
-{
+static __inline__ void __DEFAULT_FN_ATTRS_TILE
+_tile_loadconfig(const void *__config) {
  __builtin_ia32_tile_loadconfig(__config);
 }

@@ -48,9 +47,8 @@ _tile_loadconfig(const void *__config)
 ///
 /// \param __config
 ///    A pointer to 512-bits configuration
-static __inline__ void __DEFAULT_FN_ATTRS
-_tile_storeconfig(void *__config)
-{
+static __inline__ void __DEFAULT_FN_ATTRS_TILE
+_tile_storeconfig(void *__config) {
  __builtin_ia32_tile_storeconfig(__config);
 }

@@ -60,9 +58,7 @@ _tile_storeconfig(void *__config)
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
-static __inline__ void __DEFAULT_FN_ATTRS
-_tile_release(void)
-{
+static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
  __builtin_ia32_tilerelease();
 }

@@ -80,8 +76,9 @@ _tile_release(void)
 ///    A pointer to base address.
 /// \param stride
 ///    The stride between the rows' data to be loaded in memory.
-#define _tile_loadd(dst, base, stride) \
-  __builtin_ia32_tileloadd64((dst), ((const void *)(base)), (__SIZE_TYPE__)(stride))
+#define _tile_loadd(dst, base, stride)                                         \
+  __builtin_ia32_tileloadd64((dst), ((const void *)(base)),                    \
+                             (__SIZE_TYPE__)(stride))

 /// Load tile rows from memory specifieid by "base" address and "stride" into
 /// destination tile "dst" using the tile configuration previously configured
@@ -99,8 +96,9 @@ _tile_release(void)
 ///    A pointer to base address.
 /// \param stride
 ///    The stride between the rows' data to be loaded in memory.
-#define _tile_stream_loadd(dst, base, stride) \
-  __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), (__SIZE_TYPE__)(stride))
+#define _tile_stream_loadd(dst, base, stride)                                  \
+  __builtin_ia32_tileloaddt164((dst), ((const void *)(base)),                  \
+                               (__SIZE_TYPE__)(stride))

 /// Store the tile specified by "src" to memory specifieid by "base" address and
 /// "stride" using the tile configuration previously configured via
@@ -116,7 +114,7 @@ _tile_release(void)
 ///    A pointer to base address.
 /// \param stride
 ///    The stride between the rows' data to be stored in memory.
-#define _tile_stored(dst, base, stride) \
+#define _tile_stored(dst, base, stride)                                        \
  __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))

 /// Zero the tile specified by "tdest".
@@ -145,7 +143,8 @@ _tile_release(void)
 ///    The 1st source tile. Max size is 1024 Bytes.
 /// \param src1
 ///    The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbssd(dst, src0, src1) __builtin_ia32_tdpbssd((dst), (src0), (src1))
+#define _tile_dpbssd(dst, src0, src1)                                          \
+  __builtin_ia32_tdpbssd((dst), (src0), (src1))

 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
@@ -163,7 +162,8 @@ _tile_release(void)
 ///    The 1st source tile. Max size is 1024 Bytes.
 /// \param src1
 ///    The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbsud(dst, src0, src1) __builtin_ia32_tdpbsud((dst), (src0), (src1))
+#define _tile_dpbsud(dst, src0, src1)                                          \
+  __builtin_ia32_tdpbsud((dst), (src0), (src1))

 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
@@ -181,7 +181,8 @@ _tile_release(void)
 ///    The 1st source tile. Max size is 1024 Bytes.
 /// \param src1
 ///    The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbusd(dst, src0, src1) __builtin_ia32_tdpbusd((dst), (src0), (src1))
+#define _tile_dpbusd(dst, src0, src1)                                          \
+  __builtin_ia32_tdpbusd((dst), (src0), (src1))

 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
@@ -199,7 +200,8 @@ _tile_release(void)
 ///    The 1st source tile. Max size is 1024 Bytes.
 /// \param src1
 ///    The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbuud(dst, src0, src1) __builtin_ia32_tdpbuud((dst), (src0), (src1))
+#define _tile_dpbuud(dst, src0, src1)                                          \
+  __builtin_ia32_tdpbuud((dst), (src0), (src1))

 /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
 /// src1, accumulating the intermediate single-precision (32-bit) floating-point
@@ -216,10 +218,88 @@ _tile_release(void)
 ///    The 1st source tile. Max size is 1024 Bytes.
 /// \param src1
 ///    The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbf16ps(dst, src0, src1) \
+#define _tile_dpbf16ps(dst, src0, src1)                                        \
   __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
 
-#undef __DEFAULT_FN_ATTRS
+/// Function attributes for the helpers below that are compiled with the
+/// "amx-int8" target feature.
+#define __DEFAULT_FN_ATTRS_INT8                                                \
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
+
+/// 1024-byte vector of int with 64-byte alignment; the by-value representation
+/// of a tile's contents used by the helpers below.
+typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
+
+/// Internal helper: forwards the shape arguments (m, n), the base address and
+/// the stride to the tileloadd64 builtin and returns the tile it produces.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
+_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
+                     __SIZE_TYPE__ stride) {
+  return __builtin_ia32_tileloadd64_internal(m, n, base,
+                                             (__SIZE_TYPE__)(stride));
+}
+
+/// Internal helper: forwards the shape arguments (m, n, k), the accumulator
+/// dst and both source tiles to the tdpbssd builtin and returns its result.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
+_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
+                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
+}
+
+/// Internal helper: forwards the shape arguments (m, n), the base address, the
+/// stride and the tile value to the tilestored64 builtin. The builtin yields
+/// no value, so it is invoked as a plain statement rather than returned.
+static __inline__ void __DEFAULT_FN_ATTRS_INT8
+_tile_stored_internal(unsigned short m, unsigned short n, void *base,
+                      __SIZE_TYPE__ stride, _tile1024i tile) {
+  __builtin_ia32_tilestored64_internal(m, n, base, (__SIZE_TYPE__)(stride),
+                                       tile);
+}
+
+/// A tile value bundled with its shape. row and col are const, so they can
+/// only be set when the object is initialised; the helpers below pass them to
+/// the builtins as the shape arguments.
+typedef struct __tile1024i_str {
+  const unsigned short row;
+  const unsigned short col;
+  _tile1024i tile;
+} __tile1024i;
+
+/// Loads into dst->tile from base/stride, using the shape already stored in
+/// dst (dst->row, dst->col).
+__DEFAULT_FN_ATTRS_TILE
+static void __tile_loadd(__tile1024i *dst, const void *base,
+                         __SIZE_TYPE__ stride) {
+  dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
+}
+
+/// Updates dst->tile with the tdpbssd result of src1 and src2, using the
+/// current dst->tile as the accumulator operand. The shape passed down comes
+/// from the sources (src1.row, src2.col, src1.col), not from dst->row/col.
+__DEFAULT_FN_ATTRS_INT8
+static void __tile_dpbssd(__tile1024i *dst, __tile1024i src1,
+                          __tile1024i src2) {
+  dst->tile = _tile_dpbssd_internal(src1.row, src2.col, src1.col, dst->tile,
+                                    src1.tile, src2.tile);
+}
+
+/// Stores src.tile to base/stride using the shape recorded in src. src is
+/// taken by value.
+__DEFAULT_FN_ATTRS_TILE
+static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {
+  _tile_stored_internal(src.row, src.col, base, stride, src.tile);
+}
+
+/// Replaces dst->tile with the result of the tilezero builtin for the shape
+/// stored in dst (dst->row, dst->col).
+__DEFAULT_FN_ATTRS_TILE
+static void __tile_zero(__tile1024i *dst) {
+  dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
+}
+
+#undef __DEFAULT_FN_ATTRS_TILE
+#undef __DEFAULT_FN_ATTRS_INT8
 
 #endif /* __x86_64__ */
 #endif /* __AMXINTRIN_H */
--- a/lib/include/arm_acle.h
+++ b/lib/include/arm_acle.h
@@ -639,6 +639,32 @@ __jcvt(double __a) {
 }
 #endif

+/* Armv8.7-A load/store 64-byte intrinsics */
+#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_LS64)
+typedef struct {
+    uint64_t val[8]; /* 8 x 64-bit words = the 64 bytes moved per call */
+} data512_t;
+
+static __inline__ data512_t __attribute__((__always_inline__, __nodebug__))
+__arm_ld64b(const void *__addr) {
+    data512_t __value; /* not initialised here; returned after the builtin call */
+    __builtin_arm_ld64b(__addr, __value.val);
+    return __value;
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__arm_st64b(void *__addr, data512_t __value) {
+    __builtin_arm_st64b(__addr, __value.val); /* no result is handed back */
+}
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__arm_st64bv(void *__addr, data512_t __value) {
+    return __builtin_arm_st64bv(__addr, __value.val); /* builtin's 64-bit result */
+}
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__arm_st64bv0(void *__addr, data512_t __value) {
+    return __builtin_arm_st64bv0(__addr, __value.val); /* builtin's 64-bit result */
+}
+#endif /* __ARM_64BIT_STATE && defined(__ARM_FEATURE_LS64) */
+
 /* 10.1 Special register intrinsics */
 #define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg)
 #define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg)
--- a/lib/include/arm_neon.h
+++ b/lib/include/arm_neon.h
--- a/lib/include/arm_sve.h
+++ b/lib/include/arm_sve.h
@@ -94,7 +94,7 @@ typedef __clang_svbfloat16x2_t svbfloat16x2_t;
 typedef __clang_svbfloat16x3_t svbfloat16x3_t;
 typedef __clang_svbfloat16x4_t svbfloat16x4_t;
 #endif
-typedef enum
+enum svpattern
 {
  SV_POW2 = 0,
  SV_VL1 = 1,
@@ -113,9 +113,9 @@ typedef enum
  SV_MUL4 = 29,
  SV_MUL3 = 30,
  SV_ALL = 31
-} sv_pattern;
+};

-typedef enum
+enum svprfop
 {
  SV_PLDL1KEEP = 0,
  SV_PLDL1STRM = 1,
@@ -129,7 +129,7 @@ typedef enum
  SV_PSTL2STRM = 11,
  SV_PSTL3KEEP = 12,
  SV_PSTL3STRM = 13
-} sv_prfop;
+};

 /* Function attributes */
 #define __aio static inline __attribute__((__always_inline__, __nodebug__, __overloadable__))
@@ -10013,69 +10013,69 @@ int16_t svorv(svbool_t, svint16_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpfirst_b)))
 svbool_t svpfirst(svbool_t, svbool_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u32base)))
-void svprfb_gather(svbool_t, svuint32_t, sv_prfop);
+void svprfb_gather(svbool_t, svuint32_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u64base)))
-void svprfb_gather(svbool_t, svuint64_t, sv_prfop);
+void svprfb_gather(svbool_t, svuint64_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u32base_offset)))
-void svprfb_gather_offset(svbool_t, svuint32_t, int64_t, sv_prfop);
+void svprfb_gather_offset(svbool_t, svuint32_t, int64_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u64base_offset)))
-void svprfb_gather_offset(svbool_t, svuint64_t, int64_t, sv_prfop);
+void svprfb_gather_offset(svbool_t, svuint64_t, int64_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_s32offset)))
-void svprfb_gather_offset(svbool_t, void const *, svint32_t, sv_prfop);
+void svprfb_gather_offset(svbool_t, void const *, svint32_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u32offset)))
-void svprfb_gather_offset(svbool_t, void const *, svuint32_t, sv_prfop);
+void svprfb_gather_offset(svbool_t, void const *, svuint32_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_s64offset)))
-void svprfb_gather_offset(svbool_t, void const *, svint64_t, sv_prfop);
+void svprfb_gather_offset(svbool_t, void const *, svint64_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u64offset)))
-void svprfb_gather_offset(svbool_t, void const *, svuint64_t, sv_prfop);
+void svprfb_gather_offset(svbool_t, void const *, svuint64_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u32base)))
-void svprfd_gather(svbool_t, svuint32_t, sv_prfop);
+void svprfd_gather(svbool_t, svuint32_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u64base)))
-void svprfd_gather(svbool_t, svuint64_t, sv_prfop);
+void svprfd_gather(svbool_t, svuint64_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u32base_index)))
-void svprfd_gather_index(svbool_t, svuint32_t, int64_t, sv_prfop);
+void svprfd_gather_index(svbool_t, svuint32_t, int64_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u64base_index)))
-void svprfd_gather_index(svbool_t, svuint64_t, int64_t, sv_prfop);
+void svprfd_gather_index(svbool_t, svuint64_t, int64_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_s32index)))
-void svprfd_gather_index(svbool_t, void const *, svint32_t, sv_prfop);
+void svprfd_gather_index(svbool_t, void const *, svint32_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u32index)))
-void svprfd_gather_index(svbool_t, void const *, svuint32_t, sv_prfop);
+void svprfd_gather_index(svbool_t, void const *, svuint32_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_s64index)))
-void svprfd_gather_index(svbool_t, void const *, svint64_t, sv_prfop);
+void svprfd_gather_index(svbool_t, void const *, svint64_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfd_gather_u64index)))
-void svprfd_gather_index(svbool_t, void const *, svuint64_t, sv_prfop);
+void svprfd_gather_index(svbool_t, void const *, svuint64_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u32base)))
-void svprfh_gather(svbool_t, svuint32_t, sv_prfop);
+void svprfh_gather(svbool_t, svuint32_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u64base)))
-void svprfh_gather(svbool_t, svuint64_t, sv_prfop);
+void svprfh_gather(svbool_t, svuint64_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u32base_index)))
-void svprfh_gather_index(svbool_t, svuint32_t, int64_t, sv_prfop);
+void svprfh_gather_index(svbool_t, svuint32_t, int64_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u64base_index)))
-void svprfh_gather_index(svbool_t, svuint64_t, int64_t, sv_prfop);
+void svprfh_gather_index(svbool_t, svuint64_t, int64_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_s32index)))
-void svprfh_gather_index(svbool_t, void const *, svint32_t, sv_prfop);
+void svprfh_gather_index(svbool_t, void const *, svint32_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u32index)))
-void svprfh_gather_index(svbool_t, void const *, svuint32_t, sv_prfop);
+void svprfh_gather_index(svbool_t, void const *, svuint32_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_s64index)))
-void svprfh_gather_index(svbool_t, void const *, svint64_t, sv_prfop);
+void svprfh_gather_index(svbool_t, void const *, svint64_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfh_gather_u64index)))
-void svprfh_gather_index(svbool_t, void const *, svuint64_t, sv_prfop);
+void svprfh_gather_index(svbool_t, void const *, svuint64_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u32base)))
-void svprfw_gather(svbool_t, svuint32_t, sv_prfop);
+void svprfw_gather(svbool_t, svuint32_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u64base)))
-void svprfw_gather(svbool_t, svuint64_t, sv_prfop);
+void svprfw_gather(svbool_t, svuint64_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u32base_index)))
-void svprfw_gather_index(svbool_t, svuint32_t, int64_t, sv_prfop);
+void svprfw_gather_index(svbool_t, svuint32_t, int64_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u64base_index)))
-void svprfw_gather_index(svbool_t, svuint64_t, int64_t, sv_prfop);
+void svprfw_gather_index(svbool_t, svuint64_t, int64_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_s32index)))
-void svprfw_gather_index(svbool_t, void const *, svint32_t, sv_prfop);
+void svprfw_gather_index(svbool_t, void const *, svint32_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u32index)))
-void svprfw_gather_index(svbool_t, void const *, svuint32_t, sv_prfop);
+void svprfw_gather_index(svbool_t, void const *, svuint32_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_s64index)))
-void svprfw_gather_index(svbool_t, void const *, svint64_t, sv_prfop);
+void svprfw_gather_index(svbool_t, void const *, svint64_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfw_gather_u64index)))
-void svprfw_gather_index(svbool_t, void const *, svuint64_t, sv_prfop);
+void svprfw_gather_index(svbool_t, void const *, svuint64_t, enum svprfop);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s8)))
 svint8_t svqadd(svint8_t, int8_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s32)))
@@ -10117,13 +10117,13 @@ uint32_t svqdecb(uint32_t, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_n_u64)))
 uint64_t svqdecb(uint64_t, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_pat_n_s32)))
-int32_t svqdecb_pat(int32_t, sv_pattern, uint64_t);
+int32_t svqdecb_pat(int32_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_pat_n_s64)))
-int64_t svqdecb_pat(int64_t, sv_pattern, uint64_t);
+int64_t svqdecb_pat(int64_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_pat_n_u32)))
-uint32_t svqdecb_pat(uint32_t, sv_pattern, uint64_t);
+uint32_t svqdecb_pat(uint32_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecb_pat_n_u64)))
-uint64_t svqdecb_pat(uint64_t, sv_pattern, uint64_t);
+uint64_t svqdecb_pat(uint64_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_n_s32)))
 int32_t svqdecd(int32_t, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_n_s64)))
@@ -10137,17 +10137,17 @@ svint64_t svqdecd(svint64_t, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_u64)))
 svuint64_t svqdecd(svuint64_t, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_n_s32)))
-int32_t svqdecd_pat(int32_t, sv_pattern, uint64_t);
+int32_t svqdecd_pat(int32_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_n_s64)))
-int64_t svqdecd_pat(int64_t, sv_pattern, uint64_t);
+int64_t svqdecd_pat(int64_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_n_u32)))
-uint32_t svqdecd_pat(uint32_t, sv_pattern, uint64_t);
+uint32_t svqdecd_pat(uint32_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_n_u64)))
-uint64_t svqdecd_pat(uint64_t, sv_pattern, uint64_t);
+uint64_t svqdecd_pat(uint64_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_s64)))
-svint64_t svqdecd_pat(svint64_t, sv_pattern, uint64_t);
+svint64_t svqdecd_pat(svint64_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecd_pat_u64)))
-svuint64_t svqdecd_pat(svuint64_t, sv_pattern, uint64_t);
+svuint64_t svqdecd_pat(svuint64_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_n_s32)))
 int32_t svqdech(int32_t, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_n_s64)))
@@ -10161,17 +10161,17 @@ svint16_t svqdech(svint16_t, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_u16)))
 svuint16_t svqdech(svuint16_t, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_n_s32)))
-int32_t svqdech_pat(int32_t, sv_pattern, uint64_t);
+int32_t svqdech_pat(int32_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_n_s64)))
-int64_t svqdech_pat(int64_t, sv_pattern, uint64_t);
+int64_t svqdech_pat(int64_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_n_u32)))
-uint32_t svqdech_pat(uint32_t, sv_pattern, uint64_t);
+uint32_t svqdech_pat(uint32_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_n_u64)))
-uint64_t svqdech_pat(uint64_t, sv_pattern, uint64_t);
+uint64_t svqdech_pat(uint64_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_s16)))
-svint16_t svqdech_pat(svint16_t, sv_pattern, uint64_t);
+svint16_t svqdech_pat(svint16_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdech_pat_u16)))
-svuint16_t svqdech_pat(svuint16_t, sv_pattern, uint64_t);
+svuint16_t svqdech_pat(svuint16_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_s32_b8)))
 int32_t svqdecp_b8(int32_t, svbool_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecp_n_s32_b32)))
@@ -10229,17 +10229,17 @@ svint32_t svqdecw(svint32_t, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_u32)))
 svuint32_t svqdecw(svuint32_t, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_n_s32)))
-int32_t svqdecw_pat(int32_t, sv_pattern, uint64_t);
+int32_t svqdecw_pat(int32_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_n_s64)))
-int64_t svqdecw_pat(int64_t, sv_pattern, uint64_t);
+int64_t svqdecw_pat(int64_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_n_u32)))
-uint32_t svqdecw_pat(uint32_t, sv_pattern, uint64_t);
+uint32_t svqdecw_pat(uint32_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_n_u64)))
-uint64_t svqdecw_pat(uint64_t, sv_pattern, uint64_t);
+uint64_t svqdecw_pat(uint64_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_s32)))
-svint32_t svqdecw_pat(svint32_t, sv_pattern, uint64_t);
+svint32_t svqdecw_pat(svint32_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqdecw_pat_u32)))
-svuint32_t svqdecw_pat(svuint32_t, sv_pattern, uint64_t);
+svuint32_t svqdecw_pat(svuint32_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_n_s32)))
 int32_t svqincb(int32_t, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_n_s64)))
@@ -10249,13 +10249,13 @@ uint32_t svqincb(uint32_t, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_n_u64)))
 uint64_t svqincb(uint64_t, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_pat_n_s32)))
-int32_t svqincb_pat(int32_t, sv_pattern, uint64_t);
+int32_t svqincb_pat(int32_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_pat_n_s64)))
-int64_t svqincb_pat(int64_t, sv_pattern, uint64_t);
+int64_t svqincb_pat(int64_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_pat_n_u32)))
-uint32_t svqincb_pat(uint32_t, sv_pattern, uint64_t);
+uint32_t svqincb_pat(uint32_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincb_pat_n_u64)))
-uint64_t svqincb_pat(uint64_t, sv_pattern, uint64_t);
+uint64_t svqincb_pat(uint64_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_n_s32)))
 int32_t svqincd(int32_t, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_n_s64)))
@@ -10269,17 +10269,17 @@ svint64_t svqincd(svint64_t, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_u64)))
 svuint64_t svqincd(svuint64_t, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_n_s32)))
-int32_t svqincd_pat(int32_t, sv_pattern, uint64_t);
+int32_t svqincd_pat(int32_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_n_s64)))
-int64_t svqincd_pat(int64_t, sv_pattern, uint64_t);
+int64_t svqincd_pat(int64_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_n_u32)))
-uint32_t svqincd_pat(uint32_t, sv_pattern, uint64_t);
+uint32_t svqincd_pat(uint32_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_n_u64)))
-uint64_t svqincd_pat(uint64_t, sv_pattern, uint64_t);
+uint64_t svqincd_pat(uint64_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_s64)))
-svint64_t svqincd_pat(svint64_t, sv_pattern, uint64_t);
+svint64_t svqincd_pat(svint64_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincd_pat_u64)))
-svuint64_t svqincd_pat(svuint64_t, sv_pattern, uint64_t);
+svuint64_t svqincd_pat(svuint64_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_n_s32)))
 int32_t svqinch(int32_t, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_n_s64)))
@@ -10293,17 +10293,17 @@ svint16_t svqinch(svint16_t, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_u16)))
 svuint16_t svqinch(svuint16_t, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_n_s32)))
-int32_t svqinch_pat(int32_t, sv_pattern, uint64_t);
+int32_t svqinch_pat(int32_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_n_s64)))
-int64_t svqinch_pat(int64_t, sv_pattern, uint64_t);
+int64_t svqinch_pat(int64_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_n_u32)))
-uint32_t svqinch_pat(uint32_t, sv_pattern, uint64_t);
+uint32_t svqinch_pat(uint32_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_n_u64)))
-uint64_t svqinch_pat(uint64_t, sv_pattern, uint64_t);
+uint64_t svqinch_pat(uint64_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_s16)))
-svint16_t svqinch_pat(svint16_t, sv_pattern, uint64_t);
+svint16_t svqinch_pat(svint16_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqinch_pat_u16)))
-svuint16_t svqinch_pat(svuint16_t, sv_pattern, uint64_t);
+svuint16_t svqinch_pat(svuint16_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_s32_b8)))
 int32_t svqincp_b8(int32_t, svbool_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincp_n_s32_b32)))
@@ -10361,17 +10361,17 @@ svint32_t svqincw(svint32_t, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_u32)))
 svuint32_t svqincw(svuint32_t, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_n_s32)))
-int32_t svqincw_pat(int32_t, sv_pattern, uint64_t);
+int32_t svqincw_pat(int32_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_n_s64)))
-int64_t svqincw_pat(int64_t, sv_pattern, uint64_t);
+int64_t svqincw_pat(int64_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_n_u32)))
-uint32_t svqincw_pat(uint32_t, sv_pattern, uint64_t);
+uint32_t svqincw_pat(uint32_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_n_u64)))
-uint64_t svqincw_pat(uint64_t, sv_pattern, uint64_t);
+uint64_t svqincw_pat(uint64_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_s32)))
-svint32_t svqincw_pat(svint32_t, sv_pattern, uint64_t);
+svint32_t svqincw_pat(svint32_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqincw_pat_u32)))
-svuint32_t svqincw_pat(svuint32_t, sv_pattern, uint64_t);
+svuint32_t svqincw_pat(svuint32_t, enum svpattern, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s8)))
 svint8_t svqsub(svint8_t, int8_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqsub_n_s32)))
--- a/lib/include/avx512fintrin.h
+++ b/lib/include/avx512fintrin.h
@@ -9297,303 +9297,232 @@ _mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)

 /* Vector-reduction arithmetic accepts vectors as inputs and produces scalars as
 * outputs. This class of vector operation forms the basis of many scientific
- * computations. In vector-reduction arithmetic, the evaluation off is
+ * computations. In vector-reduction arithmetic, the evaluation order is
 * independent of the order of the input elements of V.

+ * For floating point types, we always assume the elements are reassociable even
+ * if -ffast-math is off.
+
 * Used bisection method. At each step, we partition the vector with previous
 * step in half, and the operation is performed on its two halves.
 * This takes log2(n) steps where n is the number of elements in the vector.
 */

-#define _mm512_mask_reduce_operator(op) \
-  __v4du __t1 = (__v4du)_mm512_extracti64x4_epi64(__W, 0); \
-  __v4du __t2 = (__v4du)_mm512_extracti64x4_epi64(__W, 1); \
-  __m256i __t3 = (__m256i)(__t1 op __t2); \
-  __v2du __t4 = (__v2du)_mm256_extracti128_si256(__t3, 0); \
-  __v2du __t5 = (__v2du)_mm256_extracti128_si256(__t3, 1); \
-  __v2du __t6 = __t4 op __t5; \
-  __v2du __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \
-  __v2du __t8 = __t6 op __t7; \
-  return __t8[0]
-
 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) {
-  _mm512_mask_reduce_operator(+);
+  return __builtin_ia32_reduce_add_q512(__W);
 }

 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) {
-  _mm512_mask_reduce_operator(*);
+  return __builtin_ia32_reduce_mul_q512(__W);
 }

 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) {
-  _mm512_mask_reduce_operator(&);
+  return __builtin_ia32_reduce_and_q512(__W);
 }

 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi64(__m512i __W) {
-  _mm512_mask_reduce_operator(|);
+  return __builtin_ia32_reduce_or_q512(__W);
 }

 static __inline__ long long __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) {
  __W = _mm512_maskz_mov_epi64(__M, __W);
-  _mm512_mask_reduce_operator(+);
+  return __builtin_ia32_reduce_add_q512(__W);
 }

 static __inline__ long long __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
  __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(1), __M, __W);
-  _mm512_mask_reduce_operator(*);
+  return __builtin_ia32_reduce_mul_q512(__W);
 }

 static __inline__ long long __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {
  __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __W);
-  _mm512_mask_reduce_operator(&);
+  return __builtin_ia32_reduce_and_q512(__W);
 }

 static __inline__ long long __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
  __W = _mm512_maskz_mov_epi64(__M, __W);
-  _mm512_mask_reduce_operator(|);
+  return __builtin_ia32_reduce_or_q512(__W);
 }
-#undef _mm512_mask_reduce_operator
-
-#define _mm512_mask_reduce_operator(op) \
-  __m256d __t1 = _mm512_extractf64x4_pd(__W, 0); \
-  __m256d __t2 = _mm512_extractf64x4_pd(__W, 1); \
-  __m256d __t3 = __t1 op __t2; \
-  __m128d __t4 = _mm256_extractf128_pd(__t3, 0); \
-  __m128d __t5 = _mm256_extractf128_pd(__t3, 1); \
-  __m128d __t6 = __t4 op __t5; \
-  __m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \
-  __m128d __t8 = __t6 op __t7; \
-  return __t8[0]

+// -0.0 is used to ignore the start value since it is the neutral value of
+// floating point addition. For more information, please refer to
+// https://llvm.org/docs/LangRef.html#llvm-vector-reduce-fadd-intrinsic
 static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_add_pd(__m512d __W) {
-  _mm512_mask_reduce_operator(+);
+  return __builtin_ia32_reduce_fadd_pd512(-0.0, __W);
 }

 static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_pd(__m512d __W) {
-  _mm512_mask_reduce_operator(*);
+  return __builtin_ia32_reduce_fmul_pd512(1.0, __W);
 }

 static __inline__ double __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) {
  __W = _mm512_maskz_mov_pd(__M, __W);
-  _mm512_mask_reduce_operator(+);
+  return __builtin_ia32_reduce_fadd_pd512(-0.0, __W);
 }

 static __inline__ double __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {
  __W = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __M, __W);
-  _mm512_mask_reduce_operator(*);
+  return __builtin_ia32_reduce_fmul_pd512(1.0, __W);
 }
-#undef _mm512_mask_reduce_operator
-
-#define _mm512_mask_reduce_operator(op) \
-  __v8su __t1 = (__v8su)_mm512_extracti64x4_epi64(__W, 0); \
-  __v8su __t2 = (__v8su)_mm512_extracti64x4_epi64(__W, 1); \
-  __m256i __t3 = (__m256i)(__t1 op __t2); \
-  __v4su __t4 = (__v4su)_mm256_extracti128_si256(__t3, 0); \
-  __v4su __t5 = (__v4su)_mm256_extracti128_si256(__t3, 1); \
-  __v4su __t6 = __t4 op __t5; \
-  __v4su __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \
-  __v4su __t8 = __t6 op __t7; \
-  __v4su __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \
-  __v4su __t10 = __t8 op __t9; \
-  return __t10[0]

 static __inline__ int __DEFAULT_FN_ATTRS512
 _mm512_reduce_add_epi32(__m512i __W) {
-  _mm512_mask_reduce_operator(+);
+  return __builtin_ia32_reduce_add_d512((__v16si)__W);
 }

 static __inline__ int __DEFAULT_FN_ATTRS512
 _mm512_reduce_mul_epi32(__m512i __W) {
-  _mm512_mask_reduce_operator(*);
+  return __builtin_ia32_reduce_mul_d512((__v16si)__W);
 }

 static __inline__ int __DEFAULT_FN_ATTRS512
 _mm512_reduce_and_epi32(__m512i __W) {
-  _mm512_mask_reduce_operator(&);
+  return __builtin_ia32_reduce_and_d512((__v16si)__W);
 }

 static __inline__ int __DEFAULT_FN_ATTRS512
 _mm512_reduce_or_epi32(__m512i __W) {
-  _mm512_mask_reduce_operator(|);
+  return __builtin_ia32_reduce_or_d512((__v16si)__W);
 }

 static __inline__ int __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) {
  __W = _mm512_maskz_mov_epi32(__M, __W);
-  _mm512_mask_reduce_operator(+);
+  return __builtin_ia32_reduce_add_d512((__v16si)__W);
 }

 static __inline__ int __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) {
  __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __M, __W);
-  _mm512_mask_reduce_operator(*);
+  return __builtin_ia32_reduce_mul_d512((__v16si)__W);
 }

 static __inline__ int __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) {
  __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __W);
-  _mm512_mask_reduce_operator(&);
+  return __builtin_ia32_reduce_and_d512((__v16si)__W);
 }

 static __inline__ int __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) {
  __W = _mm512_maskz_mov_epi32(__M, __W);
-  _mm512_mask_reduce_operator(|);
+  return __builtin_ia32_reduce_or_d512((__v16si)__W);
 }
-#undef _mm512_mask_reduce_operator
-
-#define _mm512_mask_reduce_operator(op) \
-  __m256 __t1 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 0); \
-  __m256 __t2 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 1); \
-  __m256 __t3 = __t1 op __t2; \
-  __m128 __t4 = _mm256_extractf128_ps(__t3, 0); \
-  __m128 __t5 = _mm256_extractf128_ps(__t3, 1); \
-  __m128 __t6 = __t4 op __t5; \
-  __m128 __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \
-  __m128 __t8 = __t6 op __t7; \
-  __m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \
-  __m128 __t10 = __t8 op __t9; \
-  return __t10[0]

 static __inline__ float __DEFAULT_FN_ATTRS512
 _mm512_reduce_add_ps(__m512 __W) {
-  _mm512_mask_reduce_operator(+);
+  return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W);
 }

 static __inline__ float __DEFAULT_FN_ATTRS512
 _mm512_reduce_mul_ps(__m512 __W) {
-  _mm512_mask_reduce_operator(*);
+  return __builtin_ia32_reduce_fmul_ps512(1.0f, __W);
 }

 static __inline__ float __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) {
  __W = _mm512_maskz_mov_ps(__M, __W);
-  _mm512_mask_reduce_operator(+);
+  return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W);
 }

 static __inline__ float __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
  __W = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __M, __W);
-  _mm512_mask_reduce_operator(*);
+  return __builtin_ia32_reduce_fmul_ps512(1.0f, __W);
 }
-#undef _mm512_mask_reduce_operator
-
-#define _mm512_mask_reduce_operator(op) \
-  __m512i __t1 = (__m512i)__builtin_shufflevector((__v8di)__V, (__v8di)__V, 4, 5, 6, 7, 0, 1, 2, 3); \
-  __m512i __t2 = _mm512_##op(__V, __t1); \
-  __m512i __t3 = (__m512i)__builtin_shufflevector((__v8di)__t2, (__v8di)__t2, 2, 3, 0, 1, 6, 7, 4, 5); \
-  __m512i __t4 = _mm512_##op(__t2, __t3); \
-  __m512i __t5 = (__m512i)__builtin_shufflevector((__v8di)__t4, (__v8di)__t4, 1, 0, 3, 2, 5, 4, 7, 6); \
-  __v8di __t6 = (__v8di)_mm512_##op(__t4, __t5); \
-  return __t6[0]

 static __inline__ long long __DEFAULT_FN_ATTRS512
 _mm512_reduce_max_epi64(__m512i __V) {
-  _mm512_mask_reduce_operator(max_epi64);
+  return __builtin_ia32_reduce_smax_q512(__V);
 }

 static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
 _mm512_reduce_max_epu64(__m512i __V) {
-  _mm512_mask_reduce_operator(max_epu64);
+  return __builtin_ia32_reduce_umax_q512(__V);
 }

 static __inline__ long long __DEFAULT_FN_ATTRS512
 _mm512_reduce_min_epi64(__m512i __V) {
-  _mm512_mask_reduce_operator(min_epi64);
+  return __builtin_ia32_reduce_smin_q512(__V);
 }

 static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
 _mm512_reduce_min_epu64(__m512i __V) {
-  _mm512_mask_reduce_operator(min_epu64);
+  return __builtin_ia32_reduce_umin_q512(__V);
 }

 static __inline__ long long __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) {
  __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-__LONG_LONG_MAX__ - 1LL), __M, __V);
-  _mm512_mask_reduce_operator(max_epi64);
+  return __builtin_ia32_reduce_smax_q512(__V);
 }

 static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) {
  __V = _mm512_maskz_mov_epi64(__M, __V);
-  _mm512_mask_reduce_operator(max_epu64);
+  return __builtin_ia32_reduce_umax_q512(__V);
 }

 static __inline__ long long __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {
  __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(__LONG_LONG_MAX__), __M, __V);
-  _mm512_mask_reduce_operator(min_epi64);
+  return __builtin_ia32_reduce_smin_q512(__V);
 }

 static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
  __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __V);
-  _mm512_mask_reduce_operator(min_epu64);
+  return __builtin_ia32_reduce_umin_q512(__V);
 }
-#undef _mm512_mask_reduce_operator
-
-#define _mm512_mask_reduce_operator(op) \
-  __m256i __t1 = _mm512_extracti64x4_epi64(__V, 0); \
-  __m256i __t2 = _mm512_extracti64x4_epi64(__V, 1); \
-  __m256i __t3 = _mm256_##op(__t1, __t2); \
-  __m128i __t4 = _mm256_extracti128_si256(__t3, 0); \
-  __m128i __t5 = _mm256_extracti128_si256(__t3, 1); \
-  __m128i __t6 = _mm_##op(__t4, __t5); \
-  __m128i __t7 = (__m128i)__builtin_shufflevector((__v4si)__t6, (__v4si)__t6, 2, 3, 0, 1); \
-  __m128i __t8 = _mm_##op(__t6, __t7); \
-  __m128i __t9 = (__m128i)__builtin_shufflevector((__v4si)__t8, (__v4si)__t8, 1, 0, 3, 2); \
-  __v4si __t10 = (__v4si)_mm_##op(__t8, __t9); \
-  return __t10[0]
-
 static __inline__ int __DEFAULT_FN_ATTRS512
 _mm512_reduce_max_epi32(__m512i __V) {
-  _mm512_mask_reduce_operator(max_epi32);
+  return __builtin_ia32_reduce_smax_d512((__v16si)__V);
 }

 static __inline__ unsigned int __DEFAULT_FN_ATTRS512
 _mm512_reduce_max_epu32(__m512i __V) {
-  _mm512_mask_reduce_operator(max_epu32);
+  return __builtin_ia32_reduce_umax_d512((__v16si)__V);
 }

 static __inline__ int __DEFAULT_FN_ATTRS512
 _mm512_reduce_min_epi32(__m512i __V) {
-  _mm512_mask_reduce_operator(min_epi32);
+  return __builtin_ia32_reduce_smin_d512((__v16si)__V);
 }

 static __inline__ unsigned int __DEFAULT_FN_ATTRS512
 _mm512_reduce_min_epu32(__m512i __V) {
-  _mm512_mask_reduce_operator(min_epu32);
+  return __builtin_ia32_reduce_umin_d512((__v16si)__V);
 }

 static __inline__ int __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) {
  __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-__INT_MAX__ - 1), __M, __V);
-  _mm512_mask_reduce_operator(max_epi32);
+  return __builtin_ia32_reduce_smax_d512((__v16si)__V);
 }

 static __inline__ unsigned int __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) {
  __V = _mm512_maskz_mov_epi32(__M, __V);
-  _mm512_mask_reduce_operator(max_epu32);
+  return __builtin_ia32_reduce_umax_d512((__v16si)__V);
 }

 static __inline__ int __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
  __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(__INT_MAX__), __M, __V);
-  _mm512_mask_reduce_operator(min_epi32);
+  return __builtin_ia32_reduce_smin_d512((__v16si)__V);
 }

 static __inline__ unsigned int __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
  __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __V);
-  _mm512_mask_reduce_operator(min_epu32);
+  return __builtin_ia32_reduce_umin_d512((__v16si)__V);
 }
-#undef _mm512_mask_reduce_operator

 #define _mm512_mask_reduce_operator(op) \
  __m256d __t1 = _mm512_extractf64x4_pd(__V, 0); \
--- a/lib/include/avx512vlvnniintrin.h
+++ b/lib/include/avx512vlvnniintrin.h
@@ -18,13 +18,157 @@
 #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(128)))
 #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(256)))

+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
+/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a S, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 7
+///      tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
+///      tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
+///      tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
+///      tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
+///      DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+///    ENDFOR
+///    DST[MAX:256] := 0
+/// \endoperation
+#define _mm256_dpbusd_epi32(S, A, B) \
+  ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))

-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A,
-                                             (__v8si)__B);
-}
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
+/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a S using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 7
+///      tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
+///      tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
+///      tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
+///      tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
+///      DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+///    ENDFOR
+///    DST[MAX:256] := 0
+/// \endoperation
+#define _mm256_dpbusds_epi32(S, A, B) \
+  ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
+/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
+/// and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 7
+///      tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
+///      tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
+///      DST.dword[j] := S.dword[j] + tmp1 + tmp2
+///    ENDFOR
+///    DST[MAX:256] := 0
+/// \endoperation
+#define _mm256_dpwssd_epi32(S, A, B) \
+  ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
+/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
+/// using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 7
+///      tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
+///      tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
+///      DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
+///    ENDFOR
+///    DST[MAX:256] := 0
+/// \endoperation
+#define _mm256_dpwssds_epi32(S, A, B) \
+  ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
+/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a S, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 3
+///      tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
+///      tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
+///      tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
+///      tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
+///      DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+///    ENDFOR
+///    DST[MAX:128] := 0
+/// \endoperation
+#define _mm_dpbusd_epi32(S, A, B) \
+  ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
+/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a S using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 3
+///      tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
+///      tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
+///      tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
+///      tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
+///      DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+///    ENDFOR
+///    DST[MAX:128] := 0
+/// \endoperation
+#define _mm_dpbusds_epi32(S, A, B) \
+  ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
+/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
+/// and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 3
+///      tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
+///      tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
+///      DST.dword[j] := S.dword[j] + tmp1 + tmp2
+///    ENDFOR
+///    DST[MAX:128] := 0
+/// \endoperation
+#define _mm_dpwssd_epi32(S, A, B) \
+  ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
+/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
+/// using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 3
+///      tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
+///      tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
+///      DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
+///    ENDFOR
+///    DST[MAX:128] := 0
+/// \endoperation
+#define _mm_dpwssds_epi32(S, A, B) \
+  ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
@@ -42,13 +186,6 @@ _mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
                                     (__v8si)_mm256_setzero_si256());
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A,
-                                              (__v8si)__B);
-}
-
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
 {
@@ -65,13 +202,6 @@ _mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
                                     (__v8si)_mm256_setzero_si256());
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A,
-                                             (__v8si)__B);
-}
-
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
 {
@@ -88,13 +218,6 @@ _mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
                                     (__v8si)_mm256_setzero_si256());
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
-{
-  return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A,
-                                              (__v8si)__B);
-}
-
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
 {
@@ -111,13 +234,6 @@ _mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
                                    (__v8si)_mm256_setzero_si256());
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A,
-                                             (__v4si)__B);
-}
-
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
 {
@@ -134,13 +250,6 @@ _mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
                                        (__v4si)_mm_setzero_si128());
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A,
-                                              (__v4si)__B);
-}
-
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
 {
@@ -157,13 +266,6 @@ _mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
                                       (__v4si)_mm_setzero_si128());
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A,
-                                             (__v4si)__B);
-}
-
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
 {
@@ -180,13 +282,6 @@ _mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
                                        (__v4si)_mm_setzero_si128());
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A,
-                                              (__v4si)__B);
-}
-
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
 {
--- a/lib/include/avxintrin.h
+++ b/lib/include/avxintrin.h
@@ -2245,7 +2245,7 @@ _mm256_cvttps_epi32(__m256 __a)

 /// Returns the first element of the input vector of [4 x double].
 ///
-/// \headerfile <avxintrin.h>
+/// \headerfile <x86intrin.h>
 ///
 /// This intrinsic is a utility function and does not correspond to a specific
 ///    instruction.
@@ -2261,7 +2261,7 @@ _mm256_cvtsd_f64(__m256d __a)

 /// Returns the first element of the input vector of [8 x i32].
 ///
-/// \headerfile <avxintrin.h>
+/// \headerfile <x86intrin.h>
 ///
 /// This intrinsic is a utility function and does not correspond to a specific
 ///    instruction.
@@ -2278,7 +2278,7 @@ _mm256_cvtsi256_si32(__m256i __a)

 /// Returns the first element of the input vector of [8 x float].
 ///
-/// \headerfile <avxintrin.h>
+/// \headerfile <x86intrin.h>
 ///
 /// This intrinsic is a utility function and does not correspond to a specific
 ///    instruction.
--- a/lib/include/avxvnniintrin.h
+++ b/lib/include/avxvnniintrin.h
@@ -0,0 +1,225 @@
+/*===--------------- avxvnniintrin.h - VNNI intrinsics --------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avxvnniintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVXVNNIINTRIN_H
+#define __AVXVNNIINTRIN_H
+
+/* The intrinsics below, defined in avx512vlvnniintrin.h, can be used for AVXVNNI. */
+/// \fn __m256i _mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
+/// \fn __m256i _mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
+/// \fn __m256i _mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
+/// \fn __m256i _mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
+/// \fn __m128i _mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
+/// \fn __m128i _mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
+/// \fn __m128i _mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
+/// \fn __m128i _mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
+
+/* Intrinsics with _avx_ in their name are for compatibility with MSVC. */
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(128)))
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a __S, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 7
+///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
+///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
+///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
+///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
+///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+///    ENDFOR
+///    DST[MAX:256] := 0
+/// \endoperation
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 7
+///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
+///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
+///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
+///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
+///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+///    ENDFOR
+///    DST[MAX:256] := 0
+/// \endoperation
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
+/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
+/// and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 7
+///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
+///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2
+///    ENDFOR
+///    DST[MAX:256] := 0
+/// \endoperation
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
+/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
+/// using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 7
+///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
+///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
+///    ENDFOR
+///    DST[MAX:256] := 0
+/// \endoperation
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a __S, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 3
+///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
+///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
+///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
+///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
+///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+///    ENDFOR
+///    DST[MAX:128] := 0
+/// \endoperation
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 3
+///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
+///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
+///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
+///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
+///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+///    ENDFOR
+///    DST[MAX:128] := 0
+/// \endoperation
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
+/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
+/// and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 3
+///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
+///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2
+///    ENDFOR
+///    DST[MAX:128] := 0
+/// \endoperation
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
+/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
+/// using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
+///
+/// \operation
+///    FOR j := 0 to 3
+///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
+///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
+///    ENDFOR
+///    DST[MAX:128] := 0
+/// \endoperation
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
+}
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif // __AVXVNNIINTRIN_H
--- a/lib/include/cpuid.h
+++ b/lib/include/cpuid.h
@@ -7,6 +7,9 @@
 *===-----------------------------------------------------------------------===
 */

+#ifndef __CPUID_H
+#define __CPUID_H
+
 #if !(__x86_64__ || __i386__)
 #error this header is for x86 only
 #endif
@@ -186,6 +189,7 @@
 /* Features in %edx for leaf 7 sub-leaf 0 */
 #define bit_AVX5124VNNIW  0x00000004
 #define bit_AVX5124FMAPS  0x00000008
+#define bit_UINTR         0x00000020
 #define bit_SERIALIZE     0x00004000
 #define bit_TSXLDTRK      0x00010000
 #define bit_PCONFIG       0x00040000
@@ -195,7 +199,9 @@
 #define bit_AMXINT8       0x02000000

 /* Features in %eax for leaf 7 sub-leaf 1 */
+#define bit_AVXVNNI       0x00000010 /* CPUID.(EAX=7,ECX=1):EAX bit 4 */
 #define bit_AVX512BF16    0x00000020
+#define bit_HRESET        0x00400000

 /* Features in %eax for leaf 13 sub-leaf 1 */
 #define bit_XSAVEOPT    0x00000001
@@ -309,3 +315,5 @@ static __inline int __get_cpuid_count (unsigned int __leaf,
    __cpuid_count(__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
    return 1;
 }
+
+#endif /* __CPUID_H */
--- a/lib/include/cuda_wrappers/algorithm
+++ b/lib/include/cuda_wrappers/algorithm
@@ -1,4 +1,4 @@
-/*===---- complex - CUDA wrapper for <algorithm> ----------------------------===
+/*===---- algorithm - CUDA wrapper for <algorithm> -------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
--- a/lib/include/cuda_wrappers/new
+++ b/lib/include/cuda_wrappers/new
@@ -1,4 +1,4 @@
-/*===---- complex - CUDA wrapper for <new> ------------------------------===
+/*===---- new - CUDA wrapper for <new> -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -26,6 +26,13 @@

 #include_next <new>

+#if !defined(__device__)
+// The header has been included too early from the standard C++ library
+// and CUDA-specific macros are not available yet.
+// Undo the include guard and try again later.
+#undef __CLANG_CUDA_WRAPPERS_NEW
+#else
+
 #pragma push_macro("CUDA_NOEXCEPT")
 #if __cplusplus >= 201103L
 #define CUDA_NOEXCEPT noexcept
@@ -95,4 +102,5 @@ __device__ inline void operator delete[](void *, void *) CUDA_NOEXCEPT {}

 #pragma pop_macro("CUDA_NOEXCEPT")

+#endif // __device__
 #endif // include guard
--- a/lib/include/emmintrin.h
+++ b/lib/include/emmintrin.h
@@ -4025,7 +4025,7 @@ _mm_storeu_si128(__m128i_u *__p, __m128i __b)
 ///
 /// \param __p
 ///    A pointer to a 64-bit memory location. The address of the memory
-///    location does not have to be algned.
+///    location does not have to be aligned.
 /// \param __b
 ///    A 128-bit integer vector containing the value to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
--- a/lib/include/gfniintrin.h
+++ b/lib/include/gfniintrin.h
@@ -14,38 +14,56 @@
 #ifndef __GFNIINTRIN_H
 #define __GFNIINTRIN_H

+/* Default attributes for simple form (no masking). */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni"), __min_vector_width__(128)))
+
+/* Default attributes for YMM unmasked form. */
+#define __DEFAULT_FN_ATTRS_Y __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), __min_vector_width__(256)))
+
+/* Default attributes for ZMM forms. */
+#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512)))
+
+/* Default attributes for VLX forms. */
+#define __DEFAULT_FN_ATTRS_VL128 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256)))

 #define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \
  (__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A),          \
                                                  (__v16qi)(__m128i)(B),          \
                                                  (char)(I))

-#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
-  (__m128i)__builtin_ia32_selectb_128((__mmask16)(U),                             \
-        (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I),                          \
-        (__v16qi)(__m128i)(S))
-
-
-#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
-  (__m128i)_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(),       \
-        U, A, B, I)
+#define _mm_gf2p8affine_epi64_epi8(A, B, I) \
+  (__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A),             \
+                                                  (__v16qi)(__m128i)(B),          \
+                                                  (char)(I))

+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_gf2p8mul_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A,
+              (__v16qi) __B);
+}

+#ifdef __AVXINTRIN_H
 #define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) \
  (__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A),          \
                                                  (__v32qi)(__m256i)(B),          \
                                                  (char)(I))

-#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
-   (__m256i)__builtin_ia32_selectb_256((__mmask32)(U),                            \
-        (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I),                       \
-        (__v32qi)(__m256i)(S))
-
-#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
-  (__m256i)_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \
-        U, A, B, I)
+#define _mm256_gf2p8affine_epi64_epi8(A, B, I) \
+  (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A),             \
+                                                  (__v32qi)(__m256i)(B),          \
+                                                  (char)(I))

+static __inline__ __m256i __DEFAULT_FN_ATTRS_Y
+_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi((__v32qi) __A,
+              (__v32qi) __B);
+}
+#endif /* __AVXINTRIN_H */

+#ifdef __AVX512BWINTRIN_H
 #define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) \
  (__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A),          \
                                                  (__v64qi)(__m512i)(B),          \
@@ -60,37 +78,6 @@
  (__m512i)_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(),    \
        U, A, B, I)

-#define _mm_gf2p8affine_epi64_epi8(A, B, I) \
-  (__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A),             \
-                                                  (__v16qi)(__m128i)(B),          \
-                                                  (char)(I))
-
-#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
-  (__m128i)__builtin_ia32_selectb_128((__mmask16)(U),                             \
-        (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I),                             \
-        (__v16qi)(__m128i)(S))
-
-
-#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
-  (__m128i)_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(),          \
-        U, A, B, I)
-
-
-#define _mm256_gf2p8affine_epi64_epi8(A, B, I) \
-  (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A),             \
-                                                  (__v32qi)(__m256i)(B),          \
-                                                  (char)(I))
-
-#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
-   (__m256i)__builtin_ia32_selectb_256((__mmask32)(U),                            \
-        (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I),                          \
-        (__v32qi)(__m256i)(S))
-
-#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
-  (__m256i)_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(),    \
-        U, A, B, I)
-
-
 #define _mm512_gf2p8affine_epi64_epi8(A, B, I) \
  (__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A),             \
                                                  (__v64qi)(__m512i)(B),          \
@@ -105,63 +92,6 @@
  (__m512i)_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(),       \
        U, A, B, I)

-/* Default attributes for simple form (no masking). */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni"), __min_vector_width__(128)))
-
-/* Default attributes for YMM unmasked form. */
-#define __DEFAULT_FN_ATTRS_Y __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), __min_vector_width__(256)))
-
-/* Default attributes for ZMM forms. */
-#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512)))
-
-/* Default attributes for VLX forms. */
-#define __DEFAULT_FN_ATTRS_VL128 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256)))
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_gf2p8mul_epi8(__m128i __A, __m128i __B)
-{
-  return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A,
-              (__v16qi) __B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
-_mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B)
-{
-  return (__m128i) __builtin_ia32_selectb_128(__U,
-              (__v16qi) _mm_gf2p8mul_epi8(__A, __B),
-              (__v16qi) __S);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
-_mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B)
-{
-  return _mm_mask_gf2p8mul_epi8((__m128i)_mm_setzero_si128(),
-              __U, __A, __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS_Y
-_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B)
-{
-  return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi((__v32qi) __A,
-              (__v32qi) __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
-_mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i) __builtin_ia32_selectb_256(__U,
-              (__v32qi) _mm256_gf2p8mul_epi8(__A, __B),
-              (__v32qi) __S);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
-_mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B)
-{
-  return _mm256_mask_gf2p8mul_epi8((__m256i)_mm256_setzero_si256(),
-              __U, __A, __B);
-}
-
 static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
 _mm512_gf2p8mul_epi8(__m512i __A, __m512i __B)
 {
@@ -183,6 +113,75 @@ _mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B)
  return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_si512(),
              __U, __A, __B);
 }
+#endif /* __AVX512BWINTRIN_H */
+
+#ifdef __AVX512VLBWINTRIN_H
+#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
+  (__m128i)__builtin_ia32_selectb_128((__mmask16)(U),                             \
+        (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I),                          \
+        (__v16qi)(__m128i)(S))
+
+#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
+  (__m128i)_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(),       \
+        U, A, B, I)
+
+#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
+   (__m256i)__builtin_ia32_selectb_256((__mmask32)(U),                            \
+        (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I),                       \
+        (__v32qi)(__m256i)(S))
+
+#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
+  (__m256i)_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \
+        U, A, B, I)
+
+#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
+  (__m128i)__builtin_ia32_selectb_128((__mmask16)(U),                             \
+        (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I),                             \
+        (__v16qi)(__m128i)(S))
+
+#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
+  (__m128i)_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(),          \
+        U, A, B, I)
+
+#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
+   (__m256i)__builtin_ia32_selectb_256((__mmask32)(U),                            \
+        (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I),                          \
+        (__v32qi)(__m256i)(S))
+
+#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
+  (__m256i)_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(),    \
+        U, A, B, I)
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
+_mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_selectb_128(__U,
+              (__v16qi) _mm_gf2p8mul_epi8(__A, __B),
+              (__v16qi) __S);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
+_mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return _mm_mask_gf2p8mul_epi8((__m128i)_mm_setzero_si128(),
+              __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
+_mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_selectb_256(__U,
+              (__v32qi) _mm256_gf2p8mul_epi8(__A, __B),
+              (__v32qi) __S);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
+_mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return _mm256_mask_gf2p8mul_epi8((__m256i)_mm256_setzero_si256(),
+              __U, __A, __B);
+}
+#endif /* __AVX512VLBWINTRIN_H */

 #undef __DEFAULT_FN_ATTRS
 #undef __DEFAULT_FN_ATTRS_Y
--- a/lib/include/hresetintrin.h
+++ b/lib/include/hresetintrin.h
@@ -0,0 +1,49 @@
+/*===---------------- hresetintrin.h - HRESET intrinsics -------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __X86GPRINTRIN_H
+#error "Never use <hresetintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef __HRESETINTRIN_H
+#define __HRESETINTRIN_H
+
+#if __has_extension(gnu_asm)
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, __target__("hreset")))
+
+/// Provides a hint to the processor to selectively reset the prediction
+///    history of the current logical processor specified by a 32-bit integer
+///    value \a __eax.
+///
+/// This intrinsic corresponds to the <c> HRESET </c> instruction.
+///
+/// \operation
+///    IF __eax == 0
+///      // nop
+///    ELSE
+///      FOR i := 0 to 31
+///        IF __eax[i]
+///          ResetPredictionFeature(i)
+///        FI
+///      ENDFOR
+///    FI
+/// \endoperation
+static __inline void __DEFAULT_FN_ATTRS
+_hreset(int __eax)
+{
+  __asm__ ("hreset $0" :: "a"(__eax));
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __has_extension(gnu_asm) */
+
+#endif /* __HRESETINTRIN_H */
--- a/lib/include/ia32intrin.h
+++ b/lib/include/ia32intrin.h
@@ -14,6 +14,18 @@
 #ifndef __IA32INTRIN_H
 #define __IA32INTRIN_H

+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+#define __DEFAULT_FN_ATTRS_SSE42 __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
+
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS_CAST __attribute__((__always_inline__)) constexpr
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
+#else
+#define __DEFAULT_FN_ATTRS_CAST __attribute__((__always_inline__))
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
+#endif
+
 /** Find the first set bit starting from the lsb. Result is undefined if
 *  input is 0.
 *
@@ -26,7 +38,7 @@
 *     A 32-bit integer operand.
 *  \returns A 32-bit integer containing the bit number.
 */
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
 __bsfd(int __A) {
  return __builtin_ctz(__A);
 }
@@ -43,7 +55,7 @@ __bsfd(int __A) {
 *     A 32-bit integer operand.
 *  \returns A 32-bit integer containing the bit number.
 */
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
 __bsrd(int __A) {
  return 31 - __builtin_clz(__A);
 }
@@ -59,12 +71,12 @@ __bsrd(int __A) {
 *     A 32-bit integer operand.
 *  \returns A 32-bit integer containing the swapped bytes.
 */
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
 __bswapd(int __A) {
  return __builtin_bswap32(__A);
 }

-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
 _bswap(int __A) {
  return __builtin_bswap32(__A);
 }
@@ -85,7 +97,7 @@ _bswap(int __A) {
 *     A 64-bit integer operand.
 *  \returns A 32-bit integer containing the bit number.
 */
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
 __bsfq(long long __A) {
  return __builtin_ctzll(__A);
 }
@@ -102,7 +114,7 @@ __bsfq(long long __A) {
 *     A 64-bit integer operand.
 *  \returns A 32-bit integer containing the bit number.
 */
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
 __bsrq(long long __A) {
  return 63 - __builtin_clzll(__A);
 }
@@ -118,7 +130,7 @@ __bsrq(long long __A) {
 *     A 64-bit integer operand.
 *  \returns A 64-bit integer containing the swapped bytes.
 */
-static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
 __bswapq(long long __A) {
  return __builtin_bswap64(__A);
 }
@@ -138,7 +150,7 @@ __bswapq(long long __A) {
 *  \returns A 32-bit integer containing the number of bits with value 1 in the
 *     source operand.
 */
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
 __popcntd(unsigned int __A)
 {
  return __builtin_popcount(__A);
@@ -159,7 +171,7 @@ __popcntd(unsigned int __A)
 *  \returns A 64-bit integer containing the number of bits with value 1 in the
 *     source operand.
 */
-static __inline__ long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
 __popcntq(unsigned long long __A)
 {
  return __builtin_popcountll(__A);
@@ -169,26 +181,26 @@ __popcntq(unsigned long long __A)
 #endif /* __x86_64__ */

 #ifdef __x86_64__
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __readeflags(void)
 {
  return __builtin_ia32_readeflags_u64();
 }

-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 __writeeflags(unsigned long long __f)
 {
  __builtin_ia32_writeeflags_u64(__f);
 }

 #else /* !__x86_64__ */
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __readeflags(void)
 {
  return __builtin_ia32_readeflags_u32();
 }

-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 __writeeflags(unsigned int __f)
 {
  __builtin_ia32_writeeflags_u32(__f);
@@ -205,11 +217,9 @@ __writeeflags(unsigned int __f)
 *     A 32-bit float value.
 *  \returns a 32-bit unsigned integer containing the converted value.
 */
-static __inline__ unsigned int __attribute__((__always_inline__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS_CAST
 _castf32_u32(float __A) {
-  unsigned int D;
-  __builtin_memcpy(&D, &__A, sizeof(__A));
-  return D;
+  return __builtin_bit_cast(unsigned int, __A);
 }

 /** Cast a 64-bit float value to a 64-bit unsigned integer value
@@ -222,11 +232,9 @@ _castf32_u32(float __A) {
 *     A 64-bit float value.
 *  \returns a 64-bit unsigned integer containing the converted value.
 */
-static __inline__ unsigned long long __attribute__((__always_inline__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CAST
 _castf64_u64(double __A) {
-  unsigned long long D;
-  __builtin_memcpy(&D, &__A, sizeof(__A));
-  return D;
+  return __builtin_bit_cast(unsigned long long, __A);
 }

 /** Cast a 32-bit unsigned integer value to a 32-bit float value
@@ -239,11 +247,9 @@ _castf64_u64(double __A) {
 *     A 32-bit unsigned integer value.
 *  \returns a 32-bit float value containing the converted value.
 */
-static __inline__ float __attribute__((__always_inline__))
+static __inline__ float __DEFAULT_FN_ATTRS_CAST
 _castu32_f32(unsigned int __A) {
-  float D;
-  __builtin_memcpy(&D, &__A, sizeof(__A));
-  return D;
+  return __builtin_bit_cast(float, __A);
 }

 /** Cast a 64-bit unsigned integer value to a 64-bit float value
@@ -256,11 +262,9 @@ _castu32_f32(unsigned int __A) {
 *     A 64-bit unsigned integer value.
 *  \returns a 64-bit float value containing the converted value.
 */
-static __inline__ double __attribute__((__always_inline__))
+static __inline__ double __DEFAULT_FN_ATTRS_CAST
 _castu64_f64(unsigned long long __A) {
-  double D;
-  __builtin_memcpy(&D, &__A, sizeof(__A));
-  return D;
+  return __builtin_bit_cast(double, __A);
 }

 /** Adds the unsigned integer operand to the CRC-32C checksum of the
@@ -278,7 +282,7 @@ _castu64_f64(unsigned long long __A) {
 *  \returns The result of adding operand \a __C to the CRC-32C checksum of
 *     operand \a __D.
 */
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS_SSE42
 __crc32b(unsigned int __C, unsigned char __D)
 {
  return __builtin_ia32_crc32qi(__C, __D);
@@ -299,7 +303,7 @@ __crc32b(unsigned int __C, unsigned char __D)
 *  \returns The result of adding operand \a __C to the CRC-32C checksum of
 *     operand \a __D.
 */
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS_SSE42
 __crc32w(unsigned int __C, unsigned short __D)
 {
  return __builtin_ia32_crc32hi(__C, __D);
@@ -320,7 +324,7 @@ __crc32w(unsigned int __C, unsigned short __D)
 *  \returns The result of adding operand \a __C to the CRC-32C checksum of
 *     operand \a __D.
 */
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS_SSE42
 __crc32d(unsigned int __C, unsigned int __D)
 {
  return __builtin_ia32_crc32si(__C, __D);
@@ -342,20 +346,20 @@ __crc32d(unsigned int __C, unsigned int __D)
 *  \returns The result of adding operand \a __C to the CRC-32C checksum of
 *     operand \a __D.
 */
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS_SSE42
 __crc32q(unsigned long long __C, unsigned long long __D)
 {
  return __builtin_ia32_crc32di(__C, __D);
 }
 #endif /* __x86_64__ */

-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __rdpmc(int __A) {
  return __builtin_ia32_rdpmc(__A);
 }

 /* __rdtscp */
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __rdtscp(unsigned int *__A) {
  return __builtin_ia32_rdtscp(__A);
 }
@@ -364,48 +368,48 @@ __rdtscp(unsigned int *__A) {

 #define _rdpmc(A) __rdpmc(A)

-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __DEFAULT_FN_ATTRS
 _wbinvd(void) {
  __builtin_ia32_wbinvd();
 }

-static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
 __rolb(unsigned char __X, int __C) {
  return __builtin_rotateleft8(__X, __C);
 }

-static __inline__ unsigned char __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
 __rorb(unsigned char __X, int __C) {
  return __builtin_rotateright8(__X, __C);
 }

-static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned short __DEFAULT_FN_ATTRS_CONSTEXPR
 __rolw(unsigned short __X, int __C) {
  return __builtin_rotateleft16(__X, __C);
 }

-static __inline__ unsigned short __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned short __DEFAULT_FN_ATTRS_CONSTEXPR
 __rorw(unsigned short __X, int __C) {
  return __builtin_rotateright16(__X, __C);
 }

-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR
 __rold(unsigned int __X, int __C) {
  return __builtin_rotateleft32(__X, __C);
 }

-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR
 __rord(unsigned int __X, int __C) {
  return __builtin_rotateright32(__X, __C);
 }

 #ifdef __x86_64__
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR
 __rolq(unsigned long long __X, int __C) {
  return __builtin_rotateleft64(__X, __C);
 }

-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR
 __rorq(unsigned long long __X, int __C) {
  return __builtin_rotateright64(__X, __C);
 }
@@ -429,4 +433,9 @@ __rorq(unsigned long long __X, int __C) {
 #define _rotwl(a,b) __rolw((a), (b))
 #define _rotwr(a,b) __rorw((a), (b))

+#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_CAST
+#undef __DEFAULT_FN_ATTRS_SSE42
+#undef __DEFAULT_FN_ATTRS_CONSTEXPR
+
 #endif /* __IA32INTRIN_H */
--- a/lib/include/immintrin.h
+++ b/lib/include/immintrin.h
@@ -10,6 +10,8 @@
 #ifndef __IMMINTRIN_H
 #define __IMMINTRIN_H

+#include <x86gprintrin.h>
+
 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
    defined(__MMX__)
 #include <mmintrin.h>
@@ -143,6 +145,11 @@
 #include <avx512vlvnniintrin.h>
 #endif

+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__AVXVNNI__)
+#include <avxvnniintrin.h>
+#endif
+
 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
    defined(__AVX512DQ__)
 #include <avx512dqintrin.h>
@@ -471,6 +478,11 @@ _storebe_i64(void * __P, long long __D) {
 #include <invpcidintrin.h>
 #endif

+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__KL__) || defined(__WIDEKL__)
+#include <keylockerintrin.h>
+#endif
+
 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
    defined(__AMXTILE__) || defined(__AMXINT8__) || defined(__AMXBF16__)
 #include <amxintrin.h>
--- a/lib/include/intrin.h
+++ b/lib/include/intrin.h
@@ -57,16 +57,11 @@ void __addfsbyte(unsigned long, unsigned char);
 void __addfsdword(unsigned long, unsigned long);
 void __addfsword(unsigned long, unsigned short);
 void __code_seg(const char *);
-static __inline__
 void __cpuid(int[4], int);
-static __inline__
 void __cpuidex(int[4], int, int);
-static __inline__
 __int64 __emul(int, int);
-static __inline__
 unsigned __int64 __emulu(unsigned int, unsigned int);
 unsigned int __getcallerseflags(void);
-static __inline__
 void __halt(void);
 unsigned char __inbyte(unsigned short);
 void __inbytestring(unsigned short, unsigned char *, unsigned long);
@@ -82,13 +77,9 @@ void __inwordstring(unsigned short, unsigned short *, unsigned long);
 void __lidt(void *);
 unsigned __int64 __ll_lshift(unsigned __int64, int);
 __int64 __ll_rshift(__int64, int);
-static __inline__
 void __movsb(unsigned char *, unsigned char const *, size_t);
-static __inline__
 void __movsd(unsigned long *, unsigned long const *, size_t);
-static __inline__
 void __movsw(unsigned short *, unsigned short const *, size_t);
-static __inline__
 void __nop(void);
 void __nvreg_restore_fence(void);
 void __nvreg_save_fence(void);
@@ -105,23 +96,16 @@ unsigned long __readcr4(void);
 unsigned long __readcr8(void);
 unsigned int __readdr(unsigned int);
 #ifdef __i386__
-static __inline__
 unsigned char __readfsbyte(unsigned long);
-static __inline__
 unsigned __int64 __readfsqword(unsigned long);
-static __inline__
 unsigned short __readfsword(unsigned long);
 #endif
-static __inline__
 unsigned __int64 __readmsr(unsigned long);
 unsigned __int64 __readpmc(unsigned long);
 unsigned long __segmentlimit(unsigned long);
 void __sidt(void *);
-static __inline__
 void __stosb(unsigned char *, unsigned char, size_t);
-static __inline__
 void __stosd(unsigned long *, unsigned long, size_t);
-static __inline__
 void __stosw(unsigned short *, unsigned short, size_t);
 void __svm_clgi(void);
 void __svm_invlpga(void *, int);
@@ -136,7 +120,6 @@ void __vmx_off(void);
 void __vmx_vmptrst(unsigned __int64 *);
 void __wbinvd(void);
 void __writecr0(unsigned int);
-static __inline__
 void __writecr3(unsigned __INTPTR_TYPE__);
 void __writecr4(unsigned int);
 void __writecr8(unsigned int);
@@ -146,11 +129,8 @@ void __writefsdword(unsigned long, unsigned long);
 void __writefsqword(unsigned long, unsigned __int64);
 void __writefsword(unsigned long, unsigned short);
 void __writemsr(unsigned long, unsigned __int64);
-static __inline__
 void *_AddressOfReturnAddress(void);
-static __inline__
 unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
-static __inline__
 unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
 unsigned char _bittest(long const *, long);
 unsigned char _bittestandcomplement(long *, long);
@@ -169,12 +149,10 @@ long _InterlockedExchangeAdd_HLEAcquire(long volatile *, long);
 long _InterlockedExchangeAdd_HLERelease(long volatile *, long);
 __int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *, __int64);
 __int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *, __int64);
-static __inline__ void
-__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
-_ReadBarrier(void);
-static __inline__ void
-__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
-_ReadWriteBarrier(void);
+void __attribute__((__deprecated__(
+    "use other intrinsics or C++11 atomics instead"))) _ReadBarrier(void);
+void __attribute__((__deprecated__(
+    "use other intrinsics or C++11 atomics instead"))) _ReadWriteBarrier(void);
 unsigned int _rorx_u32(unsigned int, const unsigned int);
 int _sarx_i32(int, unsigned int);
 #if __STDC_HOSTED__
@@ -185,9 +163,8 @@ unsigned int _shrx_u32(unsigned int, unsigned int);
 void _Store_HLERelease(long volatile *, long);
 void _Store64_HLERelease(__int64 volatile *, __int64);
 void _StorePointer_HLERelease(void *volatile *, void *);
-static __inline__ void
-__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
-_WriteBarrier(void);
+void __attribute__((__deprecated__(
+    "use other intrinsics or C++11 atomics instead"))) _WriteBarrier(void);
 unsigned __int32 xbegin(void);
 void _xend(void);

@@ -197,19 +174,14 @@ void __addgsbyte(unsigned long, unsigned char);
 void __addgsdword(unsigned long, unsigned long);
 void __addgsqword(unsigned long, unsigned __int64);
 void __addgsword(unsigned long, unsigned short);
-static __inline__
 void __faststorefence(void);
 void __incgsbyte(unsigned long);
 void __incgsdword(unsigned long);
 void __incgsqword(unsigned long);
 void __incgsword(unsigned long);
-static __inline__
 void __movsq(unsigned long long *, unsigned long long const *, size_t);
-static __inline__
 unsigned char __readgsbyte(unsigned long);
-static __inline__
 unsigned long __readgsdword(unsigned long);
-static __inline__
 unsigned __int64 __readgsqword(unsigned long);
 unsigned short __readgsword(unsigned long);
 unsigned __int64 __shiftleft128(unsigned __int64 _LowPart,
@@ -218,7 +190,6 @@ unsigned __int64 __shiftleft128(unsigned __int64 _LowPart,
 unsigned __int64 __shiftright128(unsigned __int64 _LowPart,
                                 unsigned __int64 _HighPart,
                                 unsigned char _Shift);
-static __inline__
 void __stosq(unsigned __int64 *, unsigned __int64, size_t);
 unsigned char __vmx_on(unsigned __int64 *);
 unsigned char __vmx_vmclear(unsigned __int64 *);
@@ -243,10 +214,6 @@ unsigned char _interlockedbittestandreset64(__int64 volatile *, __int64);
 unsigned char _interlockedbittestandset64(__int64 volatile *, __int64);
 long _InterlockedCompareExchange_np(long volatile *_Destination, long _Exchange,
                                    long _Comparand);
-unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination,
-                                             __int64 _ExchangeHigh,
-                                             __int64 _ExchangeLow,
-                                             __int64 *_CompareandResult);
 unsigned char _InterlockedCompareExchange128_np(__int64 volatile *_Destination,
                                                __int64 _ExchangeHigh,
                                                __int64 _ExchangeLow,
@@ -269,13 +236,9 @@ unsigned __int64 _rorx_u64(unsigned __int64, const unsigned int);
 __int64 _sarx_i64(__int64, unsigned int);
 unsigned __int64 _shlx_u64(unsigned __int64, unsigned int);
 unsigned __int64 _shrx_u64(unsigned __int64, unsigned int);
-static __inline__
 __int64 __mulh(__int64, __int64);
-static __inline__
 unsigned __int64 __umulh(unsigned __int64, unsigned __int64);
-static __inline__
 __int64 _mul128(__int64, __int64, __int64*);
-static __inline__
 unsigned __int64 _umul128(unsigned __int64,
                          unsigned __int64,
                          unsigned __int64*);
@@ -284,29 +247,19 @@ unsigned __int64 _umul128(unsigned __int64,

 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)

-static __inline__
 unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
-static __inline__
 unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);

 #endif

 #if defined(__i386__) || defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
-static __inline__
 __int64 _InterlockedDecrement64(__int64 volatile *_Addend);
-static __inline__
 __int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value);
-static __inline__
 __int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value);
-static __inline__
 __int64 _InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value);
-static __inline__
 __int64 _InterlockedIncrement64(__int64 volatile *_Addend);
-static __inline__
 __int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask);
-static __inline__
 __int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask);
-static __inline__
 __int64 _InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask);

 #endif
@@ -470,45 +423,81 @@ __int64 _InterlockedCompareExchange64_nf(__int64 volatile *_Destination,
 __int64 _InterlockedCompareExchange64_rel(__int64 volatile *_Destination,
                              __int64 _Exchange, __int64 _Comparand);
 #endif
+#if defined(__x86_64__) || defined(__aarch64__)
+unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination,
+                                             __int64 _ExchangeHigh,
+                                             __int64 _ExchangeLow,
+                                             __int64 *_ComparandResult);
+#endif
+#if defined(__aarch64__)
+unsigned char _InterlockedCompareExchange128_acq(__int64 volatile *_Destination,
+                                                 __int64 _ExchangeHigh,
+                                                 __int64 _ExchangeLow,
+                                                 __int64 *_ComparandResult);
+unsigned char _InterlockedCompareExchange128_nf(__int64 volatile *_Destination,
+                                                __int64 _ExchangeHigh,
+                                                __int64 _ExchangeLow,
+                                                __int64 *_ComparandResult);
+unsigned char _InterlockedCompareExchange128_rel(__int64 volatile *_Destination,
+                                                 __int64 _ExchangeHigh,
+                                                 __int64 _ExchangeLow,
+                                                 __int64 *_ComparandResult);
+#endif

 /*----------------------------------------------------------------------------*\
 |* movs, stos
 \*----------------------------------------------------------------------------*/
 #if defined(__i386__) || defined(__x86_64__)
-static __inline__ void __DEFAULT_FN_ATTRS
-__movsb(unsigned char *__dst, unsigned char const *__src, size_t __n) {
+static __inline__ void __DEFAULT_FN_ATTRS __movsb(unsigned char *__dst,
+                                                  unsigned char const *__src,
+                                                  size_t __n) {
  __asm__ __volatile__("rep movsb" : "+D"(__dst), "+S"(__src), "+c"(__n)
                       : : "memory");
 }
-static __inline__ void __DEFAULT_FN_ATTRS
-__movsd(unsigned long *__dst, unsigned long const *__src, size_t __n) {
-  __asm__ __volatile__("rep movsl" : "+D"(__dst), "+S"(__src), "+c"(__n)
-                       : : "memory");
-}
-static __inline__ void __DEFAULT_FN_ATTRS
-__movsw(unsigned short *__dst, unsigned short const *__src, size_t __n) {
-  __asm__ __volatile__("rep movsw" : "+D"(__dst), "+S"(__src), "+c"(__n)
-                       : : "memory");
-}
-static __inline__ void __DEFAULT_FN_ATTRS
-__stosd(unsigned long *__dst, unsigned long __x, size_t __n) {
-  __asm__ __volatile__("rep stosl" : "+D"(__dst), "+c"(__n) : "a"(__x)
+static __inline__ void __DEFAULT_FN_ATTRS __movsd(unsigned long *__dst,
+                                                  unsigned long const *__src,
+                                                  size_t __n) {
+  __asm__ __volatile__("rep movsl"
+                       : "+D"(__dst), "+S"(__src), "+c"(__n)
+                       :
                       : "memory");
 }
-static __inline__ void __DEFAULT_FN_ATTRS
-__stosw(unsigned short *__dst, unsigned short __x, size_t __n) {
-  __asm__ __volatile__("rep stosw" : "+D"(__dst), "+c"(__n) : "a"(__x)
+static __inline__ void __DEFAULT_FN_ATTRS __movsw(unsigned short *__dst,
+                                                  unsigned short const *__src,
+                                                  size_t __n) {
+  __asm__ __volatile__("rep movsw"
+                       : "+D"(__dst), "+S"(__src), "+c"(__n)
+                       :
+                       : "memory");
+}
+static __inline__ void __DEFAULT_FN_ATTRS __stosd(unsigned long *__dst,
+                                                  unsigned long __x,
+                                                  size_t __n) {
+  __asm__ __volatile__("rep stosl"
+                       : "+D"(__dst), "+c"(__n)
+                       : "a"(__x)
+                       : "memory");
+}
+static __inline__ void __DEFAULT_FN_ATTRS __stosw(unsigned short *__dst,
+                                                  unsigned short __x,
+                                                  size_t __n) {
+  __asm__ __volatile__("rep stosw"
+                       : "+D"(__dst), "+c"(__n)
+                       : "a"(__x)
                       : "memory");
 }
 #endif
 #ifdef __x86_64__
-static __inline__ void __DEFAULT_FN_ATTRS
-__movsq(unsigned long long *__dst, unsigned long long const *__src, size_t __n) {
-  __asm__ __volatile__("rep movsq" : "+D"(__dst), "+S"(__src), "+c"(__n)
-                       : : "memory");
+static __inline__ void __DEFAULT_FN_ATTRS __movsq(
+    unsigned long long *__dst, unsigned long long const *__src, size_t __n) {
+  __asm__ __volatile__("rep movsq"
+                       : "+D"(__dst), "+S"(__src), "+c"(__n)
+                       :
+                       : "memory");
 }
-static __inline__ void __DEFAULT_FN_ATTRS
-__stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) {
+static __inline__ void __DEFAULT_FN_ATTRS __stosq(unsigned __int64 *__dst,
+                                                  unsigned __int64 __x,
+                                                  size_t __n) {
  __asm__ __volatile__("rep stosq" : "+D"(__dst), "+c"(__n) : "a"(__x)
                       : "memory");
 }
@@ -518,26 +507,25 @@ __stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) {
 |* Misc
 \*----------------------------------------------------------------------------*/
 #if defined(__i386__) || defined(__x86_64__)
-static __inline__ void __DEFAULT_FN_ATTRS
-__cpuid(int __info[4], int __level) {
-  __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
-                   : "a"(__level), "c"(0));
+static __inline__ void __DEFAULT_FN_ATTRS __cpuid(int __info[4], int __level) {
+  __asm__("cpuid"
+          : "=a"(__info[0]), "=b"(__info[1]), "=c"(__info[2]), "=d"(__info[3])
+          : "a"(__level), "c"(0));
 }
-static __inline__ void __DEFAULT_FN_ATTRS
-__cpuidex(int __info[4], int __level, int __ecx) {
-  __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
-                   : "a"(__level), "c"(__ecx));
+static __inline__ void __DEFAULT_FN_ATTRS __cpuidex(int __info[4], int __level,
+                                                    int __ecx) {
+  __asm__("cpuid"
+          : "=a"(__info[0]), "=b"(__info[1]), "=c"(__info[2]), "=d"(__info[3])
+          : "a"(__level), "c"(__ecx));
 }
-static __inline__ void __DEFAULT_FN_ATTRS
-__halt(void) {
-  __asm__ volatile ("hlt");
+static __inline__ void __DEFAULT_FN_ATTRS __halt(void) {
+  __asm__ volatile("hlt");
 }
 #endif

 #if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__)
-static __inline__ void __DEFAULT_FN_ATTRS
-__nop(void) {
-  __asm__ volatile ("nop");
+static __inline__ void __DEFAULT_FN_ATTRS __nop(void) {
+  __asm__ volatile("nop");
 }
 #endif

@@ -574,8 +562,7 @@ __readmsr(unsigned long __register) {
 }
 #endif

-static __inline__ unsigned __LPTRINT_TYPE__ __DEFAULT_FN_ATTRS
-__readcr3(void) {
+static __inline__ unsigned __LPTRINT_TYPE__ __DEFAULT_FN_ATTRS __readcr3(void) {
  unsigned __LPTRINT_TYPE__ __cr3_val;
  __asm__ __volatile__ ("mov %%cr3, %0" : "=r"(__cr3_val) : : "memory");
  return __cr3_val;
--- a/lib/include/keylockerintrin.h
+++ b/lib/include/keylockerintrin.h
@@ -0,0 +1,506 @@
+/*===----------------- keylockerintrin.h - KL Intrinsics -------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <keylockerintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _KEYLOCKERINTRIN_H
+#define _KEYLOCKERINTRIN_H
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__KL__)
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, __target__("kl"),\
+                 __min_vector_width__(128)))
+
+/// Load internal wrapping key from __intkey, __enkey_lo and __enkey_hi. __ctl
+/// will be assigned to EAX, which specifies the KeySource and whether backing
+/// up the key is permitted. The 256-bit encryption key is loaded from the two
+/// explicit operands (__enkey_lo and __enkey_hi). The 128-bit integrity key is
+/// loaded from the implicit operand XMM0, which is assigned by __intkey.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LOADIWKEY </c> instructions.
+///
+/// \operation
+/// IF CPL > 0 // LOADIWKEY is only allowed at ring 0 (supervisor mode)
+///   GP (0)
+/// FI
+/// IF “LOADIWKEY exiting” VM execution control set
+///   VMexit
+/// FI
+/// IF __ctl[4:1] > 1 // Reserved KeySource encoding used
+///   GP (0)
+/// FI
+/// IF __ctl[31:5] != 0 // Reserved bit in __ctl is set
+///   GP (0)
+/// FI
+/// IF __ctl[0] AND (CPUID.19H.ECX[0] == 0) // NoBackup is not supported on this part
+///   GP (0)
+/// FI
+/// IF (__ctl[4:1] == 1) AND (CPUID.19H.ECX[1] == 0) // KeySource of 1 is not supported on this part
+///   GP (0)
+/// FI
+/// IF (__ctl[4:1] == 0) // KeySource of 0.
+///   IWKey.Encryption Key[127:0] := __enkey_hi[127:0]:
+///   IWKey.Encryption Key[255:128] := __enkey_lo[127:0]
+///   IWKey.IntegrityKey[127:0] := __intkey[127:0]
+///   IWKey.NoBackup := __ctl[0]
+///   IWKey.KeySource := __ctl[4:1]
+///   ZF := 0
+/// ELSE // KeySource of 1. See RDSEED definition for details of randomness
+///   IF HW_NRND_GEN.ready == 1 // Full-entropy random data from RDSEED was received
+///     IWKey.Encryption Key[127:0] := __enkey_hi[127:0] XOR HW_NRND_GEN.data[127:0]
+///     IWKey.Encryption Key[255:128] := __enkey_lo[127:0] XOR HW_NRND_GEN.data[255:128]
+///     IWKey.Encryption Key[255:0] := __enkey_hi[127:0]:__enkey_lo[127:0] XOR HW_NRND_GEN.data[255:0]
+///     IWKey.IntegrityKey[127:0] := __intkey[127:0] XOR HW_NRND_GEN.data[383:256]
+///     IWKey.NoBackup := __ctl[0]
+///     IWKey.KeySource := __ctl[4:1]
+///     ZF := 0
+///   ELSE // Random data was not returned from RDSEED. IWKey was not loaded
+///     ZF := 1
+///   FI
+/// FI
+/// dst := ZF
+/// OF := 0
+/// SF := 0
+/// AF := 0
+/// PF := 0
+/// CF := 0
+/// \endoperation
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_loadiwkey (unsigned int __ctl, __m128i __intkey,
+               __m128i __enkey_lo, __m128i __enkey_hi) {
+  __builtin_ia32_loadiwkey (__intkey, __enkey_lo, __enkey_hi, __ctl);
+}
+
+/// Wrap a 128-bit AES key from __key into a key handle, output the handle in
+/// ((__m128i*)__h) to ((__m128i*)__h) + 5, and return a 32-bit value.
+/// The explicit source operand __htype specifies handle restrictions.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> ENCODEKEY128 </c> instructions.
+///
+/// \operation
+/// InputKey[127:0] := __key[127:0]
+/// KeyMetadata[2:0] := __htype[2:0]
+/// KeyMetadata[23:3] := 0 // Reserved for future usage
+/// KeyMetadata[27:24] := 0 // KeyType is AES-128 (value of 0)
+/// KeyMetadata[127:28] := 0 // Reserved for future usage
+/// Handle[383:0] := WrapKey128(InputKey[127:0], KeyMetadata[127:0],
+///                  IWKey.Integrity Key[127:0], IWKey.Encryption Key[255:0])
+/// dst[0] := IWKey.NoBackup
+/// dst[4:1] := IWKey.KeySource[3:0]
+/// dst[31:5] := 0
+/// MEM[__h+127:__h] := Handle[127:0]   // AAD
+/// MEM[__h+255:__h+128] := Handle[255:128] // Integrity Tag
+/// MEM[__h+383:__h+256] := Handle[383:256] // CipherText
+/// MEM[__h+511:__h+384] := 0 // Reserved for future usage
+/// MEM[__h+639:__h+512] := 0 // Reserved for future usage
+/// MEM[__h+767:__h+640] := 0 // Reserved for future usage
+/// OF := 0
+/// SF := 0
+/// ZF := 0
+/// AF := 0
+/// PF := 0
+/// CF := 0
+/// \endoperation
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) {
+  return __builtin_ia32_encodekey128_u32(__htype, (__v2di)__key, __h);
+}
+
+/// Wrap a 256-bit AES key from __key_hi:__key_lo into a key handle, then
+/// output handle in ((__m128i*)__h) to ((__m128i*)__h) + 6 and
+/// a 32-bit value as return.
+/// The explicit source operand __htype specifies handle restrictions.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> ENCODEKEY256 </c> instructions.
+///
+/// \operation
+/// InputKey[127:0] := __key_lo[127:0]
+/// InputKey[255:128] := __key_hi[127:0]
+/// KeyMetadata[2:0] := __htype[2:0]
+/// KeyMetadata[23:3] := 0 // Reserved for future usage
+/// KeyMetadata[27:24] := 1 // KeyType is AES-256 (value of 1)
+/// KeyMetadata[127:28] := 0 // Reserved for future usage
+/// Handle[511:0] := WrapKey256(InputKey[255:0], KeyMetadata[127:0],
+///                  IWKey.Integrity Key[127:0], IWKey.Encryption Key[255:0])
+/// dst[0] := IWKey.NoBackup
+/// dst[4:1] := IWKey.KeySource[3:0]
+/// dst[31:5] := 0
+/// MEM[__h+127:__h]   := Handle[127:0] // AAD
+/// MEM[__h+255:__h+128] := Handle[255:128] // Tag
+/// MEM[__h+383:__h+256] := Handle[383:256] // CipherText[127:0]
+/// MEM[__h+511:__h+384] := Handle[511:384] // CipherText[255:128]
+/// MEM[__h+639:__h+512] := 0 // Reserved for future usage
+/// MEM[__h+767:__h+640] := 0 // Reserved for future usage
+/// MEM[__h+895:__h+768] := 0 // Reserved for future usage
+/// OF := 0
+/// SF := 0
+/// ZF := 0
+/// AF := 0
+/// PF := 0
+/// CF := 0
+/// \endoperation
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm_encodekey256_u32(unsigned int __htype, __m128i __key_lo, __m128i __key_hi,
+                     void *__h) {
+  return __builtin_ia32_encodekey256_u32(__htype, (__v2di)__key_lo,
+                                         (__v2di)__key_hi, __h);
+}
+
+/// The AESENC128KL performs 10 rounds of AES to encrypt the __idata using
+/// the 128-bit key in the handle from the __h. It stores the result in the
+/// __odata. It returns the affected ZF flag status.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> AESENC128KL </c> instructions.
+///
+/// \operation
+/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic.
+/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
+///                    (Handle[127:0] AND (CPL > 0)) ||
+///                    Handle[383:256] ||
+///                    HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 )
+/// IF (IllegalHandle)
+///   ZF := 1
+/// ELSE
+///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
+///   IF (Authentic == 0)
+///     ZF := 1
+///   ELSE
+///     MEM[__odata+127:__odata] := AES128Encrypt (__idata[127:0], UnwrappedKey)
+///     ZF := 0
+///   FI
+/// FI
+/// dst := ZF
+/// OF := 0
+/// SF := 0
+/// AF := 0
+/// PF := 0
+/// CF := 0
+/// \endoperation
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_mm_aesenc128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
+  return __builtin_ia32_aesenc128kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
+}
+
+/// The AESENC256KL performs 14 rounds of AES to encrypt the __idata using
+/// the 256-bit key in the handle from the __h. It stores the result in the
+/// __odata. It returns the affected ZF flag status.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> AESENC256KL </c> instructions.
+///
+/// \operation
+/// Handle[511:0] := MEM[__h+511:__h] // Load is not guaranteed to be atomic.
+/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) ||
+///                    (Handle[127:0] AND (CPL > 0)) ||
+///                    Handle[255:128] ||
+///                    HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256 )
+/// IF (IllegalHandle)
+///   ZF := 1
+/// ELSE
+///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
+///   IF (Authentic == 0)
+///     ZF := 1
+///   ELSE
+///     MEM[__odata+127:__odata] := AES256Encrypt (__idata[127:0], UnwrappedKey)
+///     ZF := 0
+///   FI
+/// FI
+/// dst := ZF
+/// OF := 0
+/// SF := 0
+/// AF := 0
+/// PF := 0
+/// CF := 0
+/// \endoperation
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_mm_aesenc256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
+  return __builtin_ia32_aesenc256kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
+}
+
+/// The AESDEC128KL performs 10 rounds of AES to decrypt the __idata using
+/// the 128-bit key in the handle from the __h. It stores the result in the
+/// __odata. And return the affected ZF flag status.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> AESDEC128KL </c> instructions.
+///
+/// \operation
+/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic.
+/// IllegalHandle := (HandleReservedBitSet (Handle[383:0]) ||
+///                  (Handle[127:0] AND (CPL > 0)) ||
+///                  Handle[383:256] ||
+///                  HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128)
+/// IF (IllegalHandle)
+///   ZF := 1
+/// ELSE
+///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
+///   IF (Authentic == 0)
+///     ZF := 1
+///   ELSE
+///     MEM[__odata+127:__odata] := AES128Decrypt (__idata[127:0], UnwrappedKey)
+///     ZF := 0
+///   FI
+/// FI
+/// dst := ZF
+/// OF := 0
+/// SF := 0
+/// AF := 0
+/// PF := 0
+/// CF := 0
+/// \endoperation
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_mm_aesdec128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
+  return __builtin_ia32_aesdec128kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
+}
+
+/// The AESDEC256KL performs 14 rounds of AES to decrypt the __idata using
+/// the 256-bit key in the handle from the __h. It stores the result in the
+/// __odata. And return the affected ZF flag status.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> AESDEC256KL </c> instructions.
+///
+/// \operation
+/// Handle[511:0] := MEM[__h+511:__h]
+/// IllegalHandle := (HandleReservedBitSet (Handle[511:0]) ||
+///                   (Handle[127:0] AND (CPL > 0)) ||
+///                   Handle[383:256] ||
+///                   HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256)
+/// IF (IllegalHandle)
+///   ZF := 1
+/// ELSE
+///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
+///   IF (Authentic == 0)
+///     ZF := 1
+///   ELSE
+///     MEM[__odata+127:__odata] := AES256Decrypt (__idata[127:0], UnwrappedKey)
+///     ZF := 0
+///   FI
+/// FI
+/// dst := ZF
+/// OF := 0
+/// SF := 0
+/// AF := 0
+/// PF := 0
+/// CF := 0
+/// \endoperation
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
+  return __builtin_ia32_aesdec256kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) \
+          || defined(__KL__) */
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__WIDEKL__)
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, __target__("kl,widekl"),\
+                 __min_vector_width__(128)))
+
+/// Encrypt __idata[0] to __idata[7] using 128-bit AES key indicated by handle
+/// at __h and store each resultant block back from __odata to __odata+7. And
+/// return the affected ZF flag status.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> AESENCWIDE128KL </c> instructions.
+///
+/// \operation
+/// Handle := MEM[__h+383:__h]
+/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
+///                    (Handle[127:0] AND (CPL > 0)) ||
+///                    Handle[255:128] ||
+///                    HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 )
+/// IF (IllegalHandle)
+///   ZF := 1
+/// ELSE
+///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
+///   IF Authentic == 0
+///     ZF := 1
+///   ELSE
+///     FOR i := 0 to 7
+///       __odata[i] := AES128Encrypt (__idata[i], UnwrappedKey)
+///     ENDFOR
+///     ZF := 0
+///   FI
+/// FI
+/// dst := ZF
+/// OF := 0
+/// SF := 0
+/// AF := 0
+/// PF := 0
+/// CF := 0
+/// \endoperation
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_mm_aesencwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
+  return __builtin_ia32_aesencwide128kl_u8((__v2di *)__odata,
+                                           (const __v2di *)__idata, __h);
+}
+
+/// Encrypt __idata[0] to __idata[7] using 256-bit AES key indicated by handle
+/// at __h and store each resultant block back from __odata to __odata+7. And
+/// return the affected ZF flag status.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> AESENCWIDE256KL </c> instructions.
+///
+/// \operation
+/// Handle[511:0] := MEM[__h+511:__h]
+/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) ||
+///                    (Handle[127:0] AND (CPL > 0)) ||
+///                    Handle[255:128] ||
+///                    HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256 )
+/// IF (IllegalHandle)
+///   ZF := 1
+/// ELSE
+///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
+///   IF Authentic == 0
+///     ZF := 1
+///   ELSE
+///     FOR i := 0 to 7
+///       __odata[i] := AES256Encrypt (__idata[i], UnwrappedKey)
+///     ENDFOR
+///     ZF := 0
+///   FI
+/// FI
+/// dst := ZF
+/// OF := 0
+/// SF := 0
+/// AF := 0
+/// PF := 0
+/// CF := 0
+/// \endoperation
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_mm_aesencwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
+  return __builtin_ia32_aesencwide256kl_u8((__v2di *)__odata,
+                                           (const __v2di *)__idata, __h);
+}
+
+/// Decrypt __idata[0] to __idata[7] using 128-bit AES key indicated by handle
+/// at __h and store each resultant block back from __odata to __odata+7. And
+/// return the affected ZF flag status.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> AESDECWIDE128KL </c> instructions.
+///
+/// \operation
+/// Handle[383:0] := MEM[__h+383:__h]
+/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
+///                    (Handle[127:0] AND (CPL > 0)) ||
+///                    Handle[255:128] ||
+///                    HandleKeyType (Handle) != HANDLE_KEY_TYPE_AES128 )
+/// IF (IllegalHandle)
+///   ZF := 1
+/// ELSE
+///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
+///   IF Authentic == 0
+///     ZF := 1
+///   ELSE
+///     FOR i := 0 to 7
+///       __odata[i] := AES128Decrypt (__idata[i], UnwrappedKey)
+///     ENDFOR
+///     ZF := 0
+///   FI
+/// FI
+/// dst := ZF
+/// OF := 0
+/// SF := 0
+/// AF := 0
+/// PF := 0
+/// CF := 0
+/// \endoperation
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_mm_aesdecwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
+  return __builtin_ia32_aesdecwide128kl_u8((__v2di *)__odata,
+                                           (const __v2di *)__idata, __h);
+}
+
+/// Decrypt __idata[0] to __idata[7] using 256-bit AES key indicated by handle
+/// at __h and store each resultant block back from __odata to __odata+7. And
+/// return the affected ZF flag status.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> AESDECWIDE256KL </c> instructions.
+///
+/// \operation
+/// Handle[511:0] := MEM[__h+511:__h]
+/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) ||
+///                    (Handle[127:0] AND (CPL > 0)) ||
+///                    Handle[255:128] ||
+///                    HandleKeyType (Handle) != HANDLE_KEY_TYPE_AES256 )
+/// IF (IllegalHandle)
+///   ZF := 1
+/// ELSE
+///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
+///   IF Authentic == 0
+///     ZF := 1
+///   ELSE
+///     FOR i := 0 to 7
+///       __odata[i] := AES256Decrypt (__idata[i], UnwrappedKey)
+///     ENDFOR
+///     ZF := 0
+///   FI
+/// FI
+/// dst := ZF
+/// OF := 0
+/// SF := 0
+/// AF := 0
+/// PF := 0
+/// CF := 0
+/// \endoperation
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_mm_aesdecwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
+  return __builtin_ia32_aesdecwide256kl_u8((__v2di *)__odata,
+                                           (const __v2di *)__idata, __h);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) \
+          || defined(__WIDEKL__) */
+
+#endif /* _KEYLOCKERINTRIN_H */
--- a/lib/include/mm_malloc.h
+++ b/lib/include/mm_malloc.h
@@ -54,7 +54,13 @@ _mm_malloc(size_t __size, size_t __align)
 static __inline__ void __attribute__((__always_inline__, __nodebug__))
 _mm_free(void *__p)
 {
+#if defined(__MINGW32__)
+  __mingw_aligned_free(__p);
+#elif defined(_WIN32)
+  _aligned_free(__p);
+#else
  free(__p);
+#endif
 }
 #endif

--- a/lib/include/opencl-c-base.h
+++ b/lib/include/opencl-c-base.h
@@ -9,6 +9,21 @@
 #ifndef _OPENCL_BASE_H_
 #define _OPENCL_BASE_H_

+// Define extension macros
+
+#if (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ >= 200)
+// For SPIR all extensions are supported.
+#if defined(__SPIR__)
+#define cl_khr_subgroup_extended_types 1
+#define cl_khr_subgroup_non_uniform_vote 1
+#define cl_khr_subgroup_ballot 1
+#define cl_khr_subgroup_non_uniform_arithmetic 1
+#define cl_khr_subgroup_shuffle 1
+#define cl_khr_subgroup_shuffle_relative 1
+#define cl_khr_subgroup_clustered_reduce 1
+#endif // defined(__SPIR__)
+#endif // (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ >= 200)
+
 // built-in scalar data types:

 /**
@@ -568,4 +583,7 @@ typedef struct {
 #pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : end
 #endif // cl_intel_device_side_avc_motion_estimation

+// Disable any extensions we may have enabled previously.
+#pragma OPENCL EXTENSION all : disable
+
 #endif //_OPENCL_BASE_H_
--- a/lib/include/opencl-c.h
+++ b/lib/include/opencl-c.h
@@ -4633,6 +4633,7 @@ float16 __ovld __cnfn convert_float16(float16);
 // Conversions with double data type parameters or return value.

 #ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
 char __ovld __cnfn convert_char(double);
 char __ovld __cnfn convert_char_rte(double);
 char __ovld __cnfn convert_char_rtn(double);
@@ -5455,6 +5456,7 @@ double16 __ovld __cnfn convert_double16_rtz(ushort16);
 #endif //cl_khr_fp64

 #ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
 // Convert half types to non-double types.
 uchar __ovld __cnfn convert_uchar(half);
 uchar __ovld __cnfn convert_uchar_rte(half);
--- a/lib/include/openmp_wrappers/cmath
+++ b/lib/include/openmp_wrappers/cmath
@@ -24,8 +24,11 @@
 // which might live in cstdlib.
 #include <cstdlib>

+// We need limits because __clang_cuda_cmath.h below uses `std::numeric_limits`.
+#include <limits>
+
 #pragma omp begin declare variant match(                                       \
-    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any, allow_templates)})

 #define __CUDA__
 #define __OPENMP_NVPTX__
--- a/lib/include/openmp_wrappers/complex
+++ b/lib/include/openmp_wrappers/complex
@@ -18,8 +18,35 @@
 #include <cmath>

 #define __CUDA__
+#define __OPENMP_NVPTX__
 #include <__clang_cuda_complex_builtins.h>
+#undef __OPENMP_NVPTX__
 #endif

 // Grab the host header too.
 #include_next <complex>
+
+
+#ifdef __cplusplus
+
+// If we are compiling against libc++, the macro _LIBCPP_STD_VER should be set
+// after including <cmath> above. Since the complex header we use is a
+// simplified version of the libc++, we don't need it in this case. If we
+// compile against libstdc++, or any other standard library, we will overload
+// the (hopefully template) functions in the <complex> header with the ones we
+// got from libc++ which decomposes math functions, like `std::sin`, into
+// arithmetic and calls to non-complex functions, all of which we can then
+// handle.
+#ifndef _LIBCPP_STD_VER
+
+#pragma omp begin declare variant match(                                       \
+    device = {arch(nvptx, nvptx64)},                                           \
+    implementation = {extension(match_any, allow_templates)})
+
+#include <complex_cmath.h>
+
+#pragma omp end declare variant
+
+#endif
+
+#endif
--- a/lib/include/openmp_wrappers/complex.h
+++ b/lib/include/openmp_wrappers/complex.h
@@ -18,7 +18,9 @@
 #include <math.h>

 #define __CUDA__
+#define __OPENMP_NVPTX__
 #include <__clang_cuda_complex_builtins.h>
+#undef __OPENMP_NVPTX__
 #endif

 // Grab the host header too.
--- a/lib/include/openmp_wrappers/complex_cmath.h
+++ b/lib/include/openmp_wrappers/complex_cmath.h
@@ -0,0 +1,388 @@
+//===-------------------------- complex_cmath.h ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// std::complex header copied from the libcxx source and simplified for use in
+// OpenMP target offload regions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _OPENMP
+#error "This file is for OpenMP compilation only."
+#endif
+
+#ifndef __cplusplus
+#error "This file is for C++ compilation only."
+#endif
+
+#ifndef _LIBCPP_COMPLEX
+#define _LIBCPP_COMPLEX
+
+#include <cmath>
+#include <type_traits>
+
+#define __DEVICE__ static constexpr __attribute__((nothrow))
+
+namespace std {
+
+// abs
+
+template <class _Tp> __DEVICE__ _Tp abs(const std::complex<_Tp> &__c) {
+  return hypot(__c.real(), __c.imag());
+}
+
+// arg
+
+template <class _Tp> __DEVICE__ _Tp arg(const std::complex<_Tp> &__c) {
+  return atan2(__c.imag(), __c.real());
+}
+
+template <class _Tp>
+typename enable_if<is_integral<_Tp>::value || is_same<_Tp, double>::value,
+                   double>::type
+arg(_Tp __re) {
+  return atan2(0., __re);
+}
+
+template <class _Tp>
+typename enable_if<is_same<_Tp, float>::value, float>::type arg(_Tp __re) {
+  return atan2f(0.F, __re);
+}
+
+// norm
+
+template <class _Tp> __DEVICE__ _Tp norm(const std::complex<_Tp> &__c) {
+  if (std::isinf(__c.real()))
+    return abs(__c.real());
+  if (std::isinf(__c.imag()))
+    return abs(__c.imag());
+  return __c.real() * __c.real() + __c.imag() * __c.imag();
+}
+
+// conj
+
+template <class _Tp> std::complex<_Tp> conj(const std::complex<_Tp> &__c) {
+  return std::complex<_Tp>(__c.real(), -__c.imag());
+}
+
+// proj
+
+template <class _Tp> std::complex<_Tp> proj(const std::complex<_Tp> &__c) {
+  std::complex<_Tp> __r = __c;
+  if (std::isinf(__c.real()) || std::isinf(__c.imag()))
+    __r = std::complex<_Tp>(INFINITY, copysign(_Tp(0), __c.imag()));
+  return __r;
+}
+
+// polar
+
+template <class _Tp>
+complex<_Tp> polar(const _Tp &__rho, const _Tp &__theta = _Tp()) {
+  if (std::isnan(__rho) || signbit(__rho))
+    return std::complex<_Tp>(_Tp(NAN), _Tp(NAN));
+  if (std::isnan(__theta)) {
+    if (std::isinf(__rho))
+      return std::complex<_Tp>(__rho, __theta);
+    return std::complex<_Tp>(__theta, __theta);
+  }
+  if (std::isinf(__theta)) {
+    if (std::isinf(__rho))
+      return std::complex<_Tp>(__rho, _Tp(NAN));
+    return std::complex<_Tp>(_Tp(NAN), _Tp(NAN));
+  }
+  _Tp __x = __rho * cos(__theta);
+  if (std::isnan(__x))
+    __x = 0;
+  _Tp __y = __rho * sin(__theta);
+  if (std::isnan(__y))
+    __y = 0;
+  return std::complex<_Tp>(__x, __y);
+}
+
+// log
+
+template <class _Tp> std::complex<_Tp> log(const std::complex<_Tp> &__x) {
+  return std::complex<_Tp>(log(abs(__x)), arg(__x));
+}
+
+// log10
+
+template <class _Tp> std::complex<_Tp> log10(const std::complex<_Tp> &__x) {
+  return log(__x) / log(_Tp(10));
+}
+
+// sqrt
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> sqrt(const std::complex<_Tp> &__x) {
+  if (std::isinf(__x.imag()))
+    return std::complex<_Tp>(_Tp(INFINITY), __x.imag());
+  if (std::isinf(__x.real())) {
+    if (__x.real() > _Tp(0))
+      return std::complex<_Tp>(__x.real(), std::isnan(__x.imag())
+                                               ? __x.imag()
+                                               : copysign(_Tp(0), __x.imag()));
+    return std::complex<_Tp>(std::isnan(__x.imag()) ? __x.imag() : _Tp(0),
+                             copysign(__x.real(), __x.imag()));
+  }
+  return polar(sqrt(abs(__x)), arg(__x) / _Tp(2));
+}
+
+// exp
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> exp(const std::complex<_Tp> &__x) {
+  _Tp __i = __x.imag();
+  if (std::isinf(__x.real())) {
+    if (__x.real() < _Tp(0)) {
+      if (!std::isfinite(__i))
+        __i = _Tp(1);
+    } else if (__i == 0 || !std::isfinite(__i)) {
+      if (std::isinf(__i))
+        __i = _Tp(NAN);
+      return std::complex<_Tp>(__x.real(), __i);
+    }
+  } else if (std::isnan(__x.real()) && __x.imag() == 0)
+    return __x;
+  _Tp __e = exp(__x.real());
+  return std::complex<_Tp>(__e * cos(__i), __e * sin(__i));
+}
+
+// pow
+
+template <class _Tp>
+std::complex<_Tp> pow(const std::complex<_Tp> &__x,
+                      const std::complex<_Tp> &__y) {
+  return exp(__y * log(__x));
+}
+
+// __sqr, computes pow(x, 2)
+
+template <class _Tp> std::complex<_Tp> __sqr(const std::complex<_Tp> &__x) {
+  return std::complex<_Tp>((__x.real() - __x.imag()) *
+                               (__x.real() + __x.imag()),
+                           _Tp(2) * __x.real() * __x.imag());
+}
+
+// asinh
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> asinh(const std::complex<_Tp> &__x) {
+  const _Tp __pi(atan2(+0., -0.));
+  if (std::isinf(__x.real())) {
+    if (std::isnan(__x.imag()))
+      return __x;
+    if (std::isinf(__x.imag()))
+      return std::complex<_Tp>(__x.real(),
+                               copysign(__pi * _Tp(0.25), __x.imag()));
+    return std::complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag()));
+  }
+  if (std::isnan(__x.real())) {
+    if (std::isinf(__x.imag()))
+      return std::complex<_Tp>(__x.imag(), __x.real());
+    if (__x.imag() == 0)
+      return __x;
+    return std::complex<_Tp>(__x.real(), __x.real());
+  }
+  if (std::isinf(__x.imag()))
+    return std::complex<_Tp>(copysign(__x.imag(), __x.real()),
+                             copysign(__pi / _Tp(2), __x.imag()));
+  std::complex<_Tp> __z = log(__x + sqrt(__sqr(__x) + _Tp(1)));
+  return std::complex<_Tp>(copysign(__z.real(), __x.real()),
+                           copysign(__z.imag(), __x.imag()));
+}
+
+// acosh
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> acosh(const std::complex<_Tp> &__x) {
+  const _Tp __pi(atan2(+0., -0.));
+  if (std::isinf(__x.real())) {
+    if (std::isnan(__x.imag()))
+      return std::complex<_Tp>(abs(__x.real()), __x.imag());
+    if (std::isinf(__x.imag())) {
+      if (__x.real() > 0)
+        return std::complex<_Tp>(__x.real(),
+                                 copysign(__pi * _Tp(0.25), __x.imag()));
+      else
+        return std::complex<_Tp>(-__x.real(),
+                                 copysign(__pi * _Tp(0.75), __x.imag()));
+    }
+    if (__x.real() < 0)
+      return std::complex<_Tp>(-__x.real(), copysign(__pi, __x.imag()));
+    return std::complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag()));
+  }
+  if (std::isnan(__x.real())) {
+    if (std::isinf(__x.imag()))
+      return std::complex<_Tp>(abs(__x.imag()), __x.real());
+    return std::complex<_Tp>(__x.real(), __x.real());
+  }
+  if (std::isinf(__x.imag()))
+    return std::complex<_Tp>(abs(__x.imag()),
+                             copysign(__pi / _Tp(2), __x.imag()));
+  std::complex<_Tp> __z = log(__x + sqrt(__sqr(__x) - _Tp(1)));
+  return std::complex<_Tp>(copysign(__z.real(), _Tp(0)),
+                           copysign(__z.imag(), __x.imag()));
+}
+
+// atanh
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> atanh(const std::complex<_Tp> &__x) {
+  const _Tp __pi(atan2(+0., -0.));
+  if (std::isinf(__x.imag())) {
+    return std::complex<_Tp>(copysign(_Tp(0), __x.real()),
+                             copysign(__pi / _Tp(2), __x.imag()));
+  }
+  if (std::isnan(__x.imag())) {
+    if (std::isinf(__x.real()) || __x.real() == 0)
+      return std::complex<_Tp>(copysign(_Tp(0), __x.real()), __x.imag());
+    return std::complex<_Tp>(__x.imag(), __x.imag());
+  }
+  if (std::isnan(__x.real())) {
+    return std::complex<_Tp>(__x.real(), __x.real());
+  }
+  if (std::isinf(__x.real())) {
+    return std::complex<_Tp>(copysign(_Tp(0), __x.real()),
+                             copysign(__pi / _Tp(2), __x.imag()));
+  }
+  if (abs(__x.real()) == _Tp(1) && __x.imag() == _Tp(0)) {
+    return std::complex<_Tp>(copysign(_Tp(INFINITY), __x.real()),
+                             copysign(_Tp(0), __x.imag()));
+  }
+  std::complex<_Tp> __z = log((_Tp(1) + __x) / (_Tp(1) - __x)) / _Tp(2);
+  return std::complex<_Tp>(copysign(__z.real(), __x.real()),
+                           copysign(__z.imag(), __x.imag()));
+}
+
+// sinh
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> sinh(const std::complex<_Tp> &__x) {
+  if (std::isinf(__x.real()) && !std::isfinite(__x.imag()))
+    return std::complex<_Tp>(__x.real(), _Tp(NAN));
+  if (__x.real() == 0 && !std::isfinite(__x.imag()))
+    return std::complex<_Tp>(__x.real(), _Tp(NAN));
+  if (__x.imag() == 0 && !std::isfinite(__x.real()))
+    return __x;
+  return std::complex<_Tp>(sinh(__x.real()) * cos(__x.imag()),
+                           cosh(__x.real()) * sin(__x.imag()));
+}
+
+// cosh
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> cosh(const std::complex<_Tp> &__x) {
+  if (std::isinf(__x.real()) && !std::isfinite(__x.imag()))
+    return std::complex<_Tp>(abs(__x.real()), _Tp(NAN));
+  if (__x.real() == 0 && !std::isfinite(__x.imag()))
+    return std::complex<_Tp>(_Tp(NAN), __x.real());
+  if (__x.real() == 0 && __x.imag() == 0)
+    return std::complex<_Tp>(_Tp(1), __x.imag());
+  if (__x.imag() == 0 && !std::isfinite(__x.real()))
+    return std::complex<_Tp>(abs(__x.real()), __x.imag());
+  return std::complex<_Tp>(cosh(__x.real()) * cos(__x.imag()),
+                           sinh(__x.real()) * sin(__x.imag()));
+}
+
+// tanh: hyperbolic tangent of a complex value. tanh is odd, so an infinite
+// real part yields a real result of +/-1 carrying the sign of that real part.
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> tanh(const std::complex<_Tp> &__x) {
+  if (std::isinf(__x.real())) {
+    if (!std::isfinite(__x.imag()))
+      return std::complex<_Tp>(copysign(_Tp(1), __x.real()), _Tp(0));
+    return std::complex<_Tp>(copysign(_Tp(1), __x.real()),
+                             copysign(_Tp(0), sin(_Tp(2) * __x.imag())));
+  }
+  if (std::isnan(__x.real()) && __x.imag() == 0)
+    return __x; // NaN + 0i is passed through unchanged.
+  _Tp __2r(_Tp(2) * __x.real());
+  _Tp __2i(_Tp(2) * __x.imag());
+  _Tp __d(cosh(__2r) + cos(__2i));
+  _Tp __2rsh(sinh(__2r));
+  if (std::isinf(__2rsh) && std::isinf(__d)) // both overflowed: avoid inf/inf
+    return std::complex<_Tp>(__2rsh > _Tp(0) ? _Tp(1) : _Tp(-1),
+                             __2i > _Tp(0) ? _Tp(0) : _Tp(-0.));
+  return std::complex<_Tp>(__2rsh / __d, sin(__2i) / __d);
+}
+
+// asin
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> asin(const std::complex<_Tp> &__x) {
+  std::complex<_Tp> __z = asinh(complex<_Tp>(-__x.imag(), __x.real()));
+  return std::complex<_Tp>(__z.imag(), -__z.real());
+}
+
+// acos
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> acos(const std::complex<_Tp> &__x) {
+  const _Tp __pi(atan2(+0., -0.));
+  if (std::isinf(__x.real())) {
+    if (std::isnan(__x.imag()))
+      return std::complex<_Tp>(__x.imag(), __x.real());
+    if (std::isinf(__x.imag())) {
+      if (__x.real() < _Tp(0))
+        return std::complex<_Tp>(_Tp(0.75) * __pi, -__x.imag());
+      return std::complex<_Tp>(_Tp(0.25) * __pi, -__x.imag());
+    }
+    if (__x.real() < _Tp(0))
+      return std::complex<_Tp>(__pi,
+                               signbit(__x.imag()) ? -__x.real() : __x.real());
+    return std::complex<_Tp>(_Tp(0),
+                             signbit(__x.imag()) ? __x.real() : -__x.real());
+  }
+  if (std::isnan(__x.real())) {
+    if (std::isinf(__x.imag()))
+      return std::complex<_Tp>(__x.real(), -__x.imag());
+    return std::complex<_Tp>(__x.real(), __x.real());
+  }
+  if (std::isinf(__x.imag()))
+    return std::complex<_Tp>(__pi / _Tp(2), -__x.imag());
+  if (__x.real() == 0 && (__x.imag() == 0 || isnan(__x.imag())))
+    return std::complex<_Tp>(__pi / _Tp(2), -__x.imag());
+  std::complex<_Tp> __z = log(__x + sqrt(__sqr(__x) - _Tp(1)));
+  if (signbit(__x.imag()))
+    return std::complex<_Tp>(abs(__z.imag()), abs(__z.real()));
+  return std::complex<_Tp>(abs(__z.imag()), -abs(__z.real()));
+}
+
+// atan
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> atan(const std::complex<_Tp> &__x) {
+  std::complex<_Tp> __z = atanh(complex<_Tp>(-__x.imag(), __x.real()));
+  return std::complex<_Tp>(__z.imag(), -__z.real());
+}
+
+// sin
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> sin(const std::complex<_Tp> &__x) {
+  std::complex<_Tp> __z = sinh(complex<_Tp>(-__x.imag(), __x.real()));
+  return std::complex<_Tp>(__z.imag(), -__z.real());
+}
+
+// cos
+
+template <class _Tp> std::complex<_Tp> cos(const std::complex<_Tp> &__x) {
+  return cosh(complex<_Tp>(-__x.imag(), __x.real()));
+}
+
+// tan
+
+template <class _Tp>
+__DEVICE__ std::complex<_Tp> tan(const std::complex<_Tp> &__x) {
+  std::complex<_Tp> __z = tanh(complex<_Tp>(-__x.imag(), __x.real()));
+  return std::complex<_Tp>(__z.imag(), -__z.real());
+}
+
+} // namespace std
+
+#endif
--- a/lib/include/popcntintrin.h
+++ b/lib/include/popcntintrin.h
@@ -13,6 +13,12 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt")))

+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
+#else
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
+#endif
+
 /// Counts the number of bits in the source operand having a value of 1.
 ///
 /// \headerfile <x86intrin.h>
@@ -23,7 +29,7 @@
 ///    An unsigned 32-bit integer operand.
 /// \returns A 32-bit integer containing the number of bits with value 1 in the
 ///    source operand.
-static __inline__ int __DEFAULT_FN_ATTRS
+static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm_popcnt_u32(unsigned int __A)
 {
  return __builtin_popcount(__A);
@@ -40,7 +46,7 @@ _mm_popcnt_u32(unsigned int __A)
 ///    An unsigned 64-bit integer operand.
 /// \returns A 64-bit integer containing the number of bits with value 1 in the
 ///    source operand.
-static __inline__ long long __DEFAULT_FN_ATTRS
+static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm_popcnt_u64(unsigned long long __A)
 {
  return __builtin_popcountll(__A);
@@ -48,5 +54,6 @@ _mm_popcnt_u64(unsigned long long __A)
 #endif /* __x86_64__ */

 #undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_CONSTEXPR

 #endif /* __POPCNTINTRIN_H */
--- a/lib/include/ppc_wrappers/smmintrin.h
+++ b/lib/include/ppc_wrappers/smmintrin.h
@@ -78,6 +78,30 @@ extern __inline __m128i
  return (__m128i)vec_sel((__v16qu)__A, (__v16qu)__B, __lmask);
 }

+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
+  __v16qi result = (__v16qi)__A;
+  result[__N & 0xf] = __D;
+  return (__m128i)result;
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
+  __v4si result = (__v4si)__A;
+  result[__N & 3] = __D;
+  return (__m128i)result;
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
+  __v2di result = (__v2di)__A;
+  result[__N & 1] = __D;
+  return (__m128i)result;
+}
+
 #else
 #include_next <smmintrin.h>
 #endif /* defined(__linux__) && defined(__ppc64__) */
--- a/lib/include/uintrintrin.h
+++ b/lib/include/uintrintrin.h
@@ -0,0 +1,150 @@
+/*===------------------ uintrintrin.h - UINTR intrinsics -------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86GPRINTRIN_H
+#error "Never use <uintrintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef __UINTRINTRIN_H
+#define __UINTRINTRIN_H
+
+/* Define the default attributes for the functions in this file */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, __target__("uintr")))
+
+#ifdef __x86_64__
+
+/// Clears the user interrupt flag (UIF). Its effect takes place immediately: a
+///    user interrupt cannot be delivered on the instruction boundary following
+///    CLUI. Can be executed only if CR4.UINT = 1, the logical processor is in
+///    64-bit mode, and software is not executing inside an enclave; otherwise,
+///    each causes an invalid-opcode exception. Causes a transactional abort if
+///    executed inside a transactional region; the abort loads EAX as it would
+///    had it been due to an execution of CLI.
+///
+/// \headerfile <x86gprintrin.h>
+///
+/// This intrinsic corresponds to the <c> CLUI </c> instruction.
+///
+/// \operation
+///   UIF := 0
+/// \endoperation
+static __inline__ void __DEFAULT_FN_ATTRS
+_clui (void)
+{
+  __builtin_ia32_clui();
+}
+
+/// Sets the user interrupt flag (UIF). Its effect takes place immediately; a
+///    user interrupt may be delivered on the instruction boundary following
+///    STUI. Can be executed only if CR4.UINT = 1, the logical processor is in
+///    64-bit mode, and software is not executing inside an enclave; otherwise,
+///    each causes an invalid-opcode exception. Causes a transactional abort if
+///    executed inside a transactional region; the abort loads EAX as it would
+///    had it been due to an execution of STI.
+///
+/// \headerfile <x86gprintrin.h>
+///
+/// This intrinsic corresponds to the <c> STUI </c> instruction.
+///
+/// \operation
+///   UIF := 1
+/// \endoperation
+static __inline__ void __DEFAULT_FN_ATTRS
+_stui (void)
+{
+  __builtin_ia32_stui();
+}
+
+/// Get the current value of the user interrupt flag (UIF). Can be executed
+///    regardless of CPL and inside a transactional region. Can be executed only
+///    if CR4.UINT = 1, the logical processor is in 64-bit mode, and software is
+///    not executing inside an enclave; otherwise, it causes an invalid-opcode
+///    exception.
+///
+/// \headerfile <x86gprintrin.h>
+///
+/// This intrinsic corresponds to the <c> TESTUI </c> instruction.
+///
+/// \returns The current value of the user interrupt flag (UIF).
+///
+/// \operation
+///   CF := UIF
+///   ZF := 0
+///   AF := 0
+///   OF := 0
+///   PF := 0
+///   SF := 0
+///   dst := CF
+/// \endoperation
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_testui (void)
+{
+  return __builtin_ia32_testui();
+}
+
+/// Send interprocessor user interrupt. Can be executed only if
+///    CR4.UINT = IA32_UINT_TT[0] = 1, the logical processor is in 64-bit mode,
+///    and software is not executing inside an enclave; otherwise, it causes an
+///    invalid-opcode exception. May be executed at any privilege level, all of
+///    its memory accesses are performed with supervisor privilege.
+///
+/// \headerfile <x86gprintrin.h>
+///
+/// This intrinsic corresponds to the <c> SENDUIPI </c> instruction.
+///
+/// \param __a
+///    Index of user-interrupt target table entry in user-interrupt target
+///    table.
+///
+/// \operation
+///   IF __a > UITTSZ
+///     GP (0)
+///   FI
+///   tempUITTE := MEM[UITTADDR + (__a<<4)]
+///   // tempUITTE must be valid, and can't have any reserved bit set
+///   IF (tempUITTE.V == 0 OR tempUITTE[7:1] != 0)
+///     GP (0)
+///   FI
+///   tempUPID := MEM[tempUITTE.UPIDADDR] // under lock
+///   // tempUPID can't have any reserved bit set
+///   IF (tempUPID[15:2] != 0 OR tempUPID[31:24] != 0)
+///     GP (0) // release lock
+///   FI
+///   tempUPID.PIR[tempUITTE.UV] := 1;
+///   IF (tempUPID.SN == 0 AND tempUPID.ON == 0)
+///     tempUPID.ON := 1
+///     sendNotify := 1
+///   ELSE
+///     sendNotify := 0
+///   FI
+///   MEM[tempUITTE.UPIDADDR] := tempUPID // release lock
+///   IF sendNotify == 1
+///     IF IA32_APIC_BASE[10] == 1 // local APIC is in x2APIC mode
+///       // send ordinary IPI with vector tempUPID.NV to 32-bit physical APIC
+///       // ID tempUPID.NDST
+///       SendOrdinaryIPI(tempUPID.NV, tempUPID.NDST)
+///     ELSE
+///       // send ordinary IPI with vector tempUPID.NV to 8-bit physical APIC
+///       // ID tempUPID.NDST[15:8]
+///       SendOrdinaryIPI(tempUPID.NV, tempUPID.NDST[15:8])
+///     FI
+///   FI
+/// \endoperation
+static __inline__ void __DEFAULT_FN_ATTRS
+_senduipi (unsigned long long __a)
+{
+  __builtin_ia32_senduipi(__a);
+}
+
+#endif /* __x86_64__ */
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __UINTRINTRIN_H */
--- a/lib/include/wasm_simd128.h
+++ b/lib/include/wasm_simd128.h
@@ -18,8 +18,7 @@ typedef int32_t v128_t __attribute__((__vector_size__(16), __aligned__(16)));

 // Internal types determined by clang builtin definitions
 typedef int32_t __v128_u __attribute__((__vector_size__(16), __aligned__(1)));
-typedef char __i8x16 __attribute__((__vector_size__(16), __aligned__(16)));
-typedef signed char __s8x16
+typedef signed char __i8x16
    __attribute__((__vector_size__(16), __aligned__(16)));
 typedef unsigned char __u8x16
    __attribute__((__vector_size__(16), __aligned__(16)));
@@ -35,6 +34,13 @@ typedef unsigned long long __u64x2
 typedef float __f32x4 __attribute__((__vector_size__(16), __aligned__(16)));
 typedef double __f64x2 __attribute__((__vector_size__(16), __aligned__(16)));

+typedef signed char __i8x8 __attribute__((__vector_size__(8), __aligned__(8)));
+typedef unsigned char __u8x8
+    __attribute__((__vector_size__(8), __aligned__(8)));
+typedef short __i16x4 __attribute__((__vector_size__(8), __aligned__(8)));
+typedef unsigned short __u16x4
+    __attribute__((__vector_size__(8), __aligned__(8)));
+
 #define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("simd128"),        \
                 __min_vector_width__(128)))
@@ -273,7 +279,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_splat(int8_t __a) {
  (__builtin_wasm_extract_lane_s_i8x16((__i8x16)(__a), __i))

 #define wasm_u8x16_extract_lane(__a, __i)                                      \
-  (__builtin_wasm_extract_lane_u_i8x16((__i8x16)(__a), __i))
+  (__builtin_wasm_extract_lane_u_i8x16((__u8x16)(__a), __i))

 #define wasm_i8x16_replace_lane(__a, __i, __b)                                 \
  ((v128_t)__builtin_wasm_replace_lane_i8x16((__i8x16)(__a), __i, __b))
@@ -286,7 +292,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_splat(int16_t __a) {
  (__builtin_wasm_extract_lane_s_i16x8((__i16x8)(__a), __i))

 #define wasm_u16x8_extract_lane(__a, __i)                                      \
-  (__builtin_wasm_extract_lane_u_i16x8((__i16x8)(__a), __i))
+  (__builtin_wasm_extract_lane_u_i16x8((__u16x8)(__a), __i))

 #define wasm_i16x8_replace_lane(__a, __i, __b)                                 \
  ((v128_t)__builtin_wasm_replace_lane_i16x8((__i16x8)(__a), __i, __b))
@@ -333,17 +339,17 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_splat(double __a) {

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_eq(v128_t __a,
                                                          v128_t __b) {
-  return (v128_t)((__s8x16)__a == (__s8x16)__b);
+  return (v128_t)((__i8x16)__a == (__i8x16)__b);
 }

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_ne(v128_t __a,
                                                          v128_t __b) {
-  return (v128_t)((__s8x16)__a != (__s8x16)__b);
+  return (v128_t)((__i8x16)__a != (__i8x16)__b);
 }

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_lt(v128_t __a,
                                                          v128_t __b) {
-  return (v128_t)((__s8x16)__a < (__s8x16)__b);
+  return (v128_t)((__i8x16)__a < (__i8x16)__b);
 }

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_lt(v128_t __a,
@@ -353,7 +359,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_lt(v128_t __a,

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_gt(v128_t __a,
                                                          v128_t __b) {
-  return (v128_t)((__s8x16)__a > (__s8x16)__b);
+  return (v128_t)((__i8x16)__a > (__i8x16)__b);
 }

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_gt(v128_t __a,
@@ -363,7 +369,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_gt(v128_t __a,

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_le(v128_t __a,
                                                          v128_t __b) {
-  return (v128_t)((__s8x16)__a <= (__s8x16)__b);
+  return (v128_t)((__i8x16)__a <= (__i8x16)__b);
 }

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_le(v128_t __a,
@@ -373,7 +379,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_le(v128_t __a,

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_ge(v128_t __a,
                                                          v128_t __b) {
-  return (v128_t)((__s8x16)__a >= (__s8x16)__b);
+  return (v128_t)((__i8x16)__a >= (__i8x16)__b);
 }

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_ge(v128_t __a,
@@ -595,7 +601,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_shl(v128_t __a,

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_shr(v128_t __a,
                                                           int32_t __b) {
-  return (v128_t)((__s8x16)__a >> __b);
+  return (v128_t)((__i8x16)__a >> __b);
 }

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_shr(v128_t __a,
@@ -616,8 +622,8 @@ wasm_i8x16_add_saturate(v128_t __a, v128_t __b) {

 static __inline__ v128_t __DEFAULT_FN_ATTRS
 wasm_u8x16_add_saturate(v128_t __a, v128_t __b) {
-  return (v128_t)__builtin_wasm_add_saturate_u_i8x16((__i8x16)__a,
-                                                     (__i8x16)__b);
+  return (v128_t)__builtin_wasm_add_saturate_u_i8x16((__u8x16)__a,
+                                                     (__u8x16)__b);
 }

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_sub(v128_t __a,
@@ -633,8 +639,8 @@ wasm_i8x16_sub_saturate(v128_t __a, v128_t __b) {

 static __inline__ v128_t __DEFAULT_FN_ATTRS
 wasm_u8x16_sub_saturate(v128_t __a, v128_t __b) {
-  return (v128_t)__builtin_wasm_sub_saturate_u_i8x16((__i8x16)__a,
-                                                     (__i8x16)__b);
+  return (v128_t)__builtin_wasm_sub_saturate_u_i8x16((__u8x16)__a,
+                                                     (__u8x16)__b);
 }

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_min(v128_t __a,
@@ -644,7 +650,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_min(v128_t __a,

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_min(v128_t __a,
                                                           v128_t __b) {
-  return (v128_t)__builtin_wasm_min_u_i8x16((__i8x16)__a, (__i8x16)__b);
+  return (v128_t)__builtin_wasm_min_u_i8x16((__u8x16)__a, (__u8x16)__b);
 }

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_max(v128_t __a,
@@ -654,12 +660,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_max(v128_t __a,

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_max(v128_t __a,
                                                           v128_t __b) {
-  return (v128_t)__builtin_wasm_max_u_i8x16((__i8x16)__a, (__i8x16)__b);
+  return (v128_t)__builtin_wasm_max_u_i8x16((__u8x16)__a, (__u8x16)__b);
 }

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_avgr(v128_t __a,
                                                            v128_t __b) {
-  return (v128_t)__builtin_wasm_avgr_u_i8x16((__i8x16)__a, (__i8x16)__b);
+  return (v128_t)__builtin_wasm_avgr_u_i8x16((__u8x16)__a, (__u8x16)__b);
 }

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_abs(v128_t __a) {
@@ -706,8 +712,8 @@ wasm_i16x8_add_saturate(v128_t __a, v128_t __b) {

 static __inline__ v128_t __DEFAULT_FN_ATTRS
 wasm_u16x8_add_saturate(v128_t __a, v128_t __b) {
-  return (v128_t)__builtin_wasm_add_saturate_u_i16x8((__i16x8)__a,
-                                                     (__i16x8)__b);
+  return (v128_t)__builtin_wasm_add_saturate_u_i16x8((__u16x8)__a,
+                                                     (__u16x8)__b);
 }

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_sub(v128_t __a,
@@ -723,8 +729,8 @@ wasm_i16x8_sub_saturate(v128_t __a, v128_t __b) {

 static __inline__ v128_t __DEFAULT_FN_ATTRS
 wasm_u16x8_sub_saturate(v128_t __a, v128_t __b) {
-  return (v128_t)__builtin_wasm_sub_saturate_u_i16x8((__i16x8)__a,
-                                                     (__i16x8)__b);
+  return (v128_t)__builtin_wasm_sub_saturate_u_i16x8((__u16x8)__a,
+                                                     (__u16x8)__b);
 }

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_mul(v128_t __a,
@@ -739,7 +745,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_min(v128_t __a,

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_min(v128_t __a,
                                                           v128_t __b) {
-  return (v128_t)__builtin_wasm_min_u_i16x8((__i16x8)__a, (__i16x8)__b);
+  return (v128_t)__builtin_wasm_min_u_i16x8((__u16x8)__a, (__u16x8)__b);
 }

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_max(v128_t __a,
@@ -749,12 +755,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_max(v128_t __a,

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_max(v128_t __a,
                                                           v128_t __b) {
-  return (v128_t)__builtin_wasm_max_u_i16x8((__i16x8)__a, (__i16x8)__b);
+  return (v128_t)__builtin_wasm_max_u_i16x8((__u16x8)__a, (__u16x8)__b);
 }

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_avgr(v128_t __a,
                                                            v128_t __b) {
-  return (v128_t)__builtin_wasm_avgr_u_i16x8((__i16x8)__a, (__i16x8)__b);
+  return (v128_t)__builtin_wasm_avgr_u_i16x8((__u16x8)__a, (__u16x8)__b);
 }

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_abs(v128_t __a) {
@@ -810,7 +816,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_min(v128_t __a,

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_min(v128_t __a,
                                                           v128_t __b) {
-  return (v128_t)__builtin_wasm_min_u_i32x4((__i32x4)__a, (__i32x4)__b);
+  return (v128_t)__builtin_wasm_min_u_i32x4((__u32x4)__a, (__u32x4)__b);
 }

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_max(v128_t __a,
@@ -820,7 +826,7 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_max(v128_t __a,

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_max(v128_t __a,
                                                           v128_t __b) {
-  return (v128_t)__builtin_wasm_max_u_i32x4((__i32x4)__a, (__i32x4)__b);
+  return (v128_t)__builtin_wasm_max_u_i32x4((__u32x4)__a, (__u32x4)__b);
 }

 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_neg(v128_t __a) {
@@ -1071,8 +1077,8 @@ wasm_i8x16_narrow_i16x8(v128_t __a, v128_t __b) {

 static __inline__ v128_t __DEFAULT_FN_ATTRS
 wasm_u8x16_narrow_i16x8(v128_t __a, v128_t __b) {
-  return (v128_t)__builtin_wasm_narrow_u_i8x16_i16x8((__i16x8)__a,
-                                                     (__i16x8)__b);
+  return (v128_t)__builtin_wasm_narrow_u_i8x16_i16x8((__u16x8)__a,
+                                                     (__u16x8)__b);
 }

 static __inline__ v128_t __DEFAULT_FN_ATTRS
@@ -1083,48 +1089,76 @@ wasm_i16x8_narrow_i32x4(v128_t __a, v128_t __b) {

 static __inline__ v128_t __DEFAULT_FN_ATTRS
 wasm_u16x8_narrow_i32x4(v128_t __a, v128_t __b) {
-  return (v128_t)__builtin_wasm_narrow_u_i16x8_i32x4((__i32x4)__a,
-                                                     (__i32x4)__b);
+  return (v128_t)__builtin_wasm_narrow_u_i16x8_i32x4((__u32x4)__a,
+                                                     (__u32x4)__b);
 }

 static __inline__ v128_t __DEFAULT_FN_ATTRS
 wasm_i16x8_widen_low_i8x16(v128_t __a) {
-  return (v128_t)__builtin_wasm_widen_low_s_i16x8_i8x16((__i8x16)__a);
+  return (v128_t) __builtin_convertvector(
+      (__i8x8){((__i8x16)__a)[0], ((__i8x16)__a)[1], ((__i8x16)__a)[2],
+               ((__i8x16)__a)[3], ((__i8x16)__a)[4], ((__i8x16)__a)[5],
+               ((__i8x16)__a)[6], ((__i8x16)__a)[7]},
+      __i16x8);
 }

 static __inline__ v128_t __DEFAULT_FN_ATTRS
 wasm_i16x8_widen_high_i8x16(v128_t __a) {
-  return (v128_t)__builtin_wasm_widen_high_s_i16x8_i8x16((__i8x16)__a);
+  return (v128_t) __builtin_convertvector(
+      (__i8x8){((__i8x16)__a)[8], ((__i8x16)__a)[9], ((__i8x16)__a)[10],
+               ((__i8x16)__a)[11], ((__i8x16)__a)[12], ((__i8x16)__a)[13],
+               ((__i8x16)__a)[14], ((__i8x16)__a)[15]},
+      __i16x8);
 }

 static __inline__ v128_t __DEFAULT_FN_ATTRS
 wasm_i16x8_widen_low_u8x16(v128_t __a) {
-  return (v128_t)__builtin_wasm_widen_low_u_i16x8_i8x16((__i8x16)__a);
+  return (v128_t) __builtin_convertvector(
+      (__u8x8){((__u8x16)__a)[0], ((__u8x16)__a)[1], ((__u8x16)__a)[2],
+               ((__u8x16)__a)[3], ((__u8x16)__a)[4], ((__u8x16)__a)[5],
+               ((__u8x16)__a)[6], ((__u8x16)__a)[7]},
+      __u16x8);
 }

 static __inline__ v128_t __DEFAULT_FN_ATTRS
 wasm_i16x8_widen_high_u8x16(v128_t __a) {
-  return (v128_t)__builtin_wasm_widen_high_u_i16x8_i8x16((__i8x16)__a);
+  return (v128_t) __builtin_convertvector(
+      (__u8x8){((__u8x16)__a)[8], ((__u8x16)__a)[9], ((__u8x16)__a)[10],
+               ((__u8x16)__a)[11], ((__u8x16)__a)[12], ((__u8x16)__a)[13],
+               ((__u8x16)__a)[14], ((__u8x16)__a)[15]},
+      __u16x8);
 }

 static __inline__ v128_t __DEFAULT_FN_ATTRS
 wasm_i32x4_widen_low_i16x8(v128_t __a) {
-  return (v128_t)__builtin_wasm_widen_low_s_i32x4_i16x8((__i16x8)__a);
+  return (v128_t) __builtin_convertvector(
+      (__i16x4){((__i16x8)__a)[0], ((__i16x8)__a)[1], ((__i16x8)__a)[2],
+                ((__i16x8)__a)[3]},
+      __i32x4);
 }

 static __inline__ v128_t __DEFAULT_FN_ATTRS
 wasm_i32x4_widen_high_i16x8(v128_t __a) {
-  return (v128_t)__builtin_wasm_widen_high_s_i32x4_i16x8((__i16x8)__a);
+  return (v128_t) __builtin_convertvector(
+      (__i16x4){((__i16x8)__a)[4], ((__i16x8)__a)[5], ((__i16x8)__a)[6],
+                ((__i16x8)__a)[7]},
+      __i32x4);
 }

 static __inline__ v128_t __DEFAULT_FN_ATTRS
 wasm_i32x4_widen_low_u16x8(v128_t __a) {
-  return (v128_t)__builtin_wasm_widen_low_u_i32x4_i16x8((__i16x8)__a);
+  return (v128_t) __builtin_convertvector(
+      (__u16x4){((__u16x8)__a)[0], ((__u16x8)__a)[1], ((__u16x8)__a)[2],
+                ((__u16x8)__a)[3]},
+      __u32x4);
 }

 static __inline__ v128_t __DEFAULT_FN_ATTRS
 wasm_i32x4_widen_high_u16x8(v128_t __a) {
-  return (v128_t)__builtin_wasm_widen_high_u_i32x4_i16x8((__i16x8)__a);
+  return (v128_t) __builtin_convertvector(
+      (__u16x4){((__u16x8)__a)[4], ((__u16x8)__a)[5], ((__u16x8)__a)[6],
+                ((__u16x8)__a)[7]},
+      __u32x4);
 }

 // Undefine helper macros
--- a/lib/include/x86gprintrin.h
+++ b/lib/include/x86gprintrin.h
@@ -0,0 +1,23 @@
+/*===--------------- x86gprintrin.h - X86 GPR intrinsics ------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86GPRINTRIN_H
+#define __X86GPRINTRIN_H
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__HRESET__)
+#include <hresetintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__UINTR__)
+#include <uintrintrin.h>
+#endif
+
+#endif /* __X86GPRINTRIN_H */
--- a/lib/libc/glibc/abi.txt
+++ b/lib/libc/glibc/abi.txt
@@ -1195,6 +1195,7 @@ aarch64-linux-gnu aarch64_be-linux-gnu

 29
 29
+
 29
 29
 29
@@ -2097,6 +2098,10 @@ aarch64-linux-gnu aarch64_be-linux-gnu
 29
 29
 29
+43
+43
+43
+43
 29
 29
 29
@@ -2656,10 +2661,13 @@ aarch64-linux-gnu aarch64_be-linux-gnu
 29
 29
 29
+43
+43
 29
 29
 29
 29
+43
 29
 29
 29
@@ -2702,6 +2710,8 @@ aarch64-linux-gnu aarch64_be-linux-gnu
 29
 29
 29
+43
+43
 29
 29
 29
@@ -3440,6 +3450,8 @@ aarch64-linux-gnu aarch64_be-linux-gnu
 29
 29
 29
+43
+43
 29
 29
 29
@@ -5154,6 +5166,7 @@ s390x-linux-gnu

 5
 5
+
 5
 16
 5
@@ -6056,6 +6069,10 @@ s390x-linux-gnu
 5
 5
 12
+43
+43
+43
+43
 5
 5
 5
@@ -6615,10 +6632,13 @@ s390x-linux-gnu
 5
 5
 12
+43
+43
 12
 5
 5
 5
+43
 5
 5
 22
@@ -6661,6 +6681,8 @@ s390x-linux-gnu
 5
 5
 16
+43
+43
 19
 19
 23
@@ -7399,6 +7421,8 @@ s390x-linux-gnu
 5 16
 5
 5
+43
+43
 5
 5
 5
@@ -9113,6 +9137,7 @@ arm-linux-gnueabi armeb-linux-gnueabi arm-linux-gnueabihf armeb-linux-gnueabihf

 16
 16
+
 16
 16
 16
@@ -10015,6 +10040,10 @@ arm-linux-gnueabi armeb-linux-gnueabi arm-linux-gnueabihf armeb-linux-gnueabihf
 16
 16
 16
+43
+43
+43
+43
 16
 16
 16
@@ -10574,10 +10603,13 @@ arm-linux-gnueabi armeb-linux-gnueabi arm-linux-gnueabihf armeb-linux-gnueabihf
 16
 16
 16
+43
+43
 16
 16
 16
 16
+43
 16
 16
 22
@@ -10620,6 +10652,8 @@ arm-linux-gnueabi armeb-linux-gnueabi arm-linux-gnueabihf armeb-linux-gnueabihf
 16
 16
 16
+43
+43
 19
 19
 23
@@ -11358,6 +11392,8 @@ arm-linux-gnueabi armeb-linux-gnueabi arm-linux-gnueabihf armeb-linux-gnueabihf
 16
 16
 16
+43
+43
 16
 16
 16
@@ -13072,6 +13108,7 @@ sparc-linux-gnu sparcel-linux-gnu

 5
 5
+
 0
 16
 0
@@ -13974,6 +14011,10 @@ sparc-linux-gnu sparcel-linux-gnu
 0 5
 1 5
 12
+43
+43
+43
+43
 0
 1
 1
@@ -14533,10 +14574,13 @@ sparc-linux-gnu sparcel-linux-gnu
 0
 1 5
 12
+43
+43
 12
 0
 1
 0
+43
 0
 0
 22
@@ -14579,6 +14623,8 @@ sparc-linux-gnu sparcel-linux-gnu
 5
 0
 16
+43
+43
 19
 19
 23
@@ -15317,6 +15363,8 @@ sparc-linux-gnu sparcel-linux-gnu
 0 16
 0
 0
+43
+43
 0
 1
 1
@@ -17031,6 +17079,7 @@ sparcv9-linux-gnu

 5
 5
+
 5
 16
 5
@@ -17933,6 +17982,10 @@ sparcv9-linux-gnu
 5
 5
 12
+43
+43
+43
+43
 5
 5
 5
@@ -18492,10 +18545,13 @@ sparcv9-linux-gnu
 5
 5
 12
+43
+43
 12
 5
 5
 5
+43
 5
 5
 22
@@ -18538,6 +18594,8 @@ sparcv9-linux-gnu
 5
 5
 16
+43
+43
 19
 19
 23
@@ -19276,6 +19334,8 @@ sparcv9-linux-gnu
 5
 5
 5
+43
+43
 5
 5
 5
@@ -20990,6 +21050,7 @@ mips64el-linux-gnuabi64 mips64-linux-gnuabi64

 5
 5
+
 0
 16
 0
@@ -21892,6 +21953,10 @@ mips64el-linux-gnuabi64 mips64-linux-gnuabi64
 0 5
 5
 12
+43
+43
+43
+43
 0
 5
 5
@@ -22451,10 +22516,13 @@ mips64el-linux-gnuabi64 mips64-linux-gnuabi64
 0
 5
 12
+43
+43
 12
 0
 5
 0
+43
 0
 0
 22
@@ -22497,6 +22565,8 @@ mips64el-linux-gnuabi64 mips64-linux-gnuabi64
 5
 0
 16
+43
+43
 19
 19
 23
@@ -23235,6 +23305,8 @@ mips64el-linux-gnuabi64 mips64-linux-gnuabi64
 0
 0
 0
+43
+43
 0
 5
 5
@@ -24949,6 +25021,7 @@ mips64el-linux-gnuabin32 mips64-linux-gnuabin32

 5
 5
+
 0
 16
 0
@@ -25851,6 +25924,10 @@ mips64el-linux-gnuabin32 mips64-linux-gnuabin32
 0 5
 5
 12
+43
+43
+43
+43
 0
 5
 5
@@ -26410,10 +26487,13 @@ mips64el-linux-gnuabin32 mips64-linux-gnuabin32
 0
 5
 12
+43
+43
 12
 0
 5
 0
+43
 0
 0
 22
@@ -26456,6 +26536,8 @@ mips64el-linux-gnuabin32 mips64-linux-gnuabin32
 5
 0
 16
+43
+43
 19
 19
 23
@@ -27194,6 +27276,8 @@ mips64el-linux-gnuabin32 mips64-linux-gnuabin32
 0
 0
 0
+43
+43
 0
 5
 5
@@ -28908,6 +28992,7 @@ mipsel-linux-gnueabihf mips-linux-gnueabihf

 5
 5
+
 0
 16
 0
@@ -29810,6 +29895,10 @@ mipsel-linux-gnueabihf mips-linux-gnueabihf
 0 5
 5
 12
+43
+43
+43
+43
 0
 5
 5
@@ -30369,10 +30458,13 @@ mipsel-linux-gnueabihf mips-linux-gnueabihf
 0
 5
 12
+43
+43
 12
 0
 5
 0
+43
 0
 0
 22
@@ -30415,6 +30507,8 @@ mipsel-linux-gnueabihf mips-linux-gnueabihf
 5
 0
 16
+43
+43
 19
 19
 23
@@ -31153,6 +31247,8 @@ mipsel-linux-gnueabihf mips-linux-gnueabihf
 0
 0
 0
+43
+43
 0
 5
 5
@@ -32867,6 +32963,7 @@ mipsel-linux-gnueabi mips-linux-gnueabi

 5
 5
+
 0
 16
 0
@@ -33769,6 +33866,10 @@ mipsel-linux-gnueabi mips-linux-gnueabi
 0 5
 5
 12
+43
+43
+43
+43
 0
 5
 5
@@ -34328,10 +34429,13 @@ mipsel-linux-gnueabi mips-linux-gnueabi
 0
 5
 12
+43
+43
 12
 0
 5
 0
+43
 0
 0
 22
@@ -34374,6 +34478,8 @@ mipsel-linux-gnueabi mips-linux-gnueabi
 5
 0
 16
+43
+43
 19
 19
 23
@@ -35112,6 +35218,8 @@ mipsel-linux-gnueabi mips-linux-gnueabi
 0
 0
 0
+43
+43
 0
 5
 5
@@ -36826,6 +36934,7 @@ x86_64-linux-gnu

 10
 10
+43
 10
 16
 10
@@ -37728,6 +37837,10 @@ x86_64-linux-gnu
 10
 10
 12
+43
+43
+43
+43
 10
 10
 10
@@ -38287,10 +38400,13 @@ x86_64-linux-gnu
 10
 10
 12
+43
+43
 12
 10
 10
 10
+43
 10
 10
 22
@@ -38333,6 +38449,8 @@ x86_64-linux-gnu
 10
 10
 16
+43
+43
 19
 19
 23
@@ -39071,6 +39189,8 @@ x86_64-linux-gnu
 10
 10
 10
+43
+43
 10
 10
 10
@@ -40785,6 +40905,7 @@ x86_64-linux-gnux32

 28
 28
+43
 28
 28
 28
@@ -41687,6 +41808,10 @@ x86_64-linux-gnux32
 28
 28
 28
+43
+43
+43
+43
 28
 28
 28
@@ -42246,10 +42371,13 @@ x86_64-linux-gnux32
 28
 28
 28
+43
+43
 28
 28
 28
 28
+43
 28
 28
 28
@@ -42292,6 +42420,8 @@ x86_64-linux-gnux32
 28
 28
 28
+43
+43
 28
 28
 28
@@ -43030,6 +43160,8 @@ x86_64-linux-gnux32
 28
 28
 28
+43
+43
 28
 28
 28
@@ -44744,6 +44876,7 @@ i386-linux-gnu

 5
 5
+43
 0
 16
 0
@@ -45646,6 +45779,10 @@ i386-linux-gnu
 0 5
 1 5
 12
+43
+43
+43
+43
 0
 1
 1
@@ -46205,10 +46342,13 @@ i386-linux-gnu
 0
 1 5
 12
+43
+43
 12
 0
 1
 0
+43
 0
 0
 22
@@ -46251,6 +46391,8 @@ i386-linux-gnu
 5
 0
 16
+43
+43
 19
 19
 23
@@ -46989,6 +47131,8 @@ i386-linux-gnu
 0
 0
 0
+43
+43
 0
 1
 1
@@ -48703,6 +48847,7 @@ powerpc64le-linux-gnu
 42
 29
 29
+
 29
 29
 29
@@ -49605,6 +49750,10 @@ powerpc64le-linux-gnu
 29
 29
 29
+43
+43
+43
+43
 29
 29
 29
@@ -50164,10 +50313,13 @@ powerpc64le-linux-gnu
 29
 29
 29
+43
+43
 29
 29
 29
 29
+43
 29
 29
 29
@@ -50210,6 +50362,8 @@ powerpc64le-linux-gnu
 29
 29
 29
+43
+43
 29
 29
 29
@@ -50948,6 +51102,8 @@ powerpc64le-linux-gnu
 29
 29
 29
+43
+43
 29
 29
 29
@@ -52662,6 +52818,7 @@ powerpc64-linux-gnu

 12
 12
+
 12
 16
 12
@@ -53564,6 +53721,10 @@ powerpc64-linux-gnu
 12
 12
 12
+43
+43
+43
+43
 12
 12
 12
@@ -54123,10 +54284,13 @@ powerpc64-linux-gnu
 12
 12
 12
+43
+43
 12
 12
 12
 12
+43
 12
 12
 22
@@ -54169,6 +54333,8 @@ powerpc64-linux-gnu
 12
 12
 16
+43
+43
 19
 19
 23
@@ -54907,6 +55073,8 @@ powerpc64-linux-gnu
 12 16
 12
 12
+43
+43
 12
 12
 12
@@ -56621,6 +56789,7 @@ powerpc-linux-gnueabi powerpc-linux-gnueabihf

 5
 5
+
 0
 16
 0
@@ -57523,6 +57692,10 @@ powerpc-linux-gnueabi powerpc-linux-gnueabihf
 0 5
 1 5
 12
+43
+43
+43
+43
 0
 1
 1
@@ -58082,10 +58255,13 @@ powerpc-linux-gnueabi powerpc-linux-gnueabihf
 0
 1 5
 12
+43
+43
 12
 0
 1 14 15
 0
+43
 0
 0
 22
@@ -58128,6 +58304,8 @@ powerpc-linux-gnueabi powerpc-linux-gnueabihf
 5
 0
 16
+43
+43
 19
 19
 23
@@ -58866,6 +59044,8 @@ powerpc-linux-gnueabi powerpc-linux-gnueabihf
 0 16
 0
 0
+43
+43
 0
 1
 1
--- a/lib/libc/glibc/fns.txt
+++ b/lib/libc/glibc/fns.txt
@@ -1194,6 +1194,7 @@ __write c
 __wscanfieee128 c
 __wuflow c
 __wunderflow c
+__x86_get_cpuid_feature_leaf c
 __xmknod c
 __xmknodat c
 __xpg_basename c
@@ -2096,6 +2097,10 @@ fseeko64 c
 fsetpos c
 fsetpos64 c
 fsetxattr c
+fstat c
+fstat64 c
+fstatat c
+fstatat64 c
 fstatfs c
 fstatfs64 c
 fstatvfs c
@@ -2655,10 +2660,13 @@ lsearch c
 lseek c
 lseek64 c
 lsetxattr c
+lstat c
+lstat64 c
 lutimes c
 madvise c
 makecontext c
 mallinfo c
+mallinfo2 c
 malloc c
 malloc_get_state c
 malloc_info c
@@ -2701,6 +2709,8 @@ mkdirat c
 mkdtemp c
 mkfifo c
 mkfifoat c
+mknod c
+mknodat c
 mkostemp c
 mkostemp64 c
 mkostemps c
@@ -3439,6 +3449,8 @@ srandom_r c
 sscanf c
 ssignal c
 sstk c
+stat c
+stat64 c
 statfs c
 statfs64 c
 statvfs c
--- a/lib/libc/glibc/sysdeps/i386/sysdep.h
+++ b/lib/libc/glibc/sysdeps/i386/sysdep.h
@@ -61,7 +61,7 @@ lose: SYSCALL_PIC_SETUP							      \

 # define SETUP_PIC_REG(reg) \
  .ifndef GET_PC_THUNK(reg);						      \
-  .section .gnu.linkonce.t.GET_PC_THUNK(reg),"ax",@progbits;		      \
+  .section .text.GET_PC_THUNK(reg),"axG",@progbits,GET_PC_THUNK(reg),comdat;  \
  .globl GET_PC_THUNK(reg);						      \
  .hidden GET_PC_THUNK(reg);						      \
  .p2align 4;								      \
@@ -97,8 +97,9 @@ GET_PC_THUNK(reg):							      \

 # define SETUP_PIC_REG_STR(reg)						\
  ".ifndef " GET_PC_THUNK_STR (reg) "\n"				\
-  ".section .gnu.linkonce.t." GET_PC_THUNK_STR (reg) ",\"ax\",@progbits\n" \
+  ".section .text." GET_PC_THUNK_STR (reg) ",\"axG\",@progbits," 	\
+  GET_PC_THUNK_STR (reg) ",comdat\n" \
   ".globl " GET_PC_THUNK_STR (reg) "\n"					\
  ".hidden " GET_PC_THUNK_STR (reg) "\n"				\
  ".p2align 4\n"							\
  ".type " GET_PC_THUNK_STR (reg) ",@function\n"			\
--- a/lib/libc/glibc/sysdeps/sparc/nptl/bits/pthreadtypes-arch.h
+++ b/lib/libc/glibc/sysdeps/sparc/nptl/bits/pthreadtypes-arch.h
@@ -1,81 +0,0 @@
-/* Machine-specific pthread type layouts.  SPARC version.
-   Copyright (C) 2003-2019 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#ifndef _BITS_PTHREADTYPES_ARCH_H
-#define _BITS_PTHREADTYPES_ARCH_H	1
-
-#include <bits/wordsize.h>
-
-#if __WORDSIZE == 64
-# define __SIZEOF_PTHREAD_ATTR_T 56
-# define __SIZEOF_PTHREAD_MUTEX_T 40
-# define __SIZEOF_PTHREAD_CONDATTR_T 4
-# define __SIZEOF_PTHREAD_RWLOCK_T 56
-# define __SIZEOF_PTHREAD_BARRIER_T 32
-#else
-# define __SIZEOF_PTHREAD_ATTR_T 36
-# define __SIZEOF_PTHREAD_MUTEX_T 24
-# define __SIZEOF_PTHREAD_CONDATTR_T 4
-# define __SIZEOF_PTHREAD_RWLOCK_T 32
-# define __SIZEOF_PTHREAD_BARRIER_T 20
-#endif
-#define __SIZEOF_PTHREAD_MUTEXATTR_T 4
-#define __SIZEOF_PTHREAD_COND_T 48
-#define __SIZEOF_PTHREAD_RWLOCKATTR_T 8
-#define __SIZEOF_PTHREAD_BARRIERATTR_T 4
-
-/* Definitions for internal mutex struct.  */
-#define __PTHREAD_COMPAT_PADDING_MID
-#define __PTHREAD_COMPAT_PADDING_END
-#define __PTHREAD_MUTEX_LOCK_ELISION    0
-#define __PTHREAD_MUTEX_NUSERS_AFTER_KIND  (__WORDSIZE != 64)
-#define __PTHREAD_MUTEX_USE_UNION          (__WORDSIZE != 64)
-
-#define __LOCK_ALIGNMENT
-#define __ONCE_ALIGNMENT
-
-struct __pthread_rwlock_arch_t
-{
-  unsigned int __readers;
-  unsigned int __writers;
-  unsigned int __wrphase_futex;
-  unsigned int __writers_futex;
-  unsigned int __pad3;
-  unsigned int __pad4;
-#if __WORDSIZE == 64
-  int __cur_writer;
-  int __shared;
-  unsigned long int __pad1;
-  unsigned long int __pad2;
-  /* FLAGS must stay at this position in the structure to maintain
-     binary compatibility.  */
-  unsigned int __flags;
-#else
-  unsigned char __pad1;
-  unsigned char __pad2;
-  unsigned char __shared;
-  /* FLAGS must stay at this position in the structure to maintain
-     binary compatibility.  */
-  unsigned char __flags;
-  int __cur_writer;
-#endif
-};
-
-#define __PTHREAD_RWLOCK_ELISION_EXTRA 0
-
-#endif	/* bits/pthreadtypes.h */
--- a/lib/libc/glibc/vers.txt
+++ b/lib/libc/glibc/vers.txt
@@ -41,3 +41,4 @@ GLIBC_2.29
 GLIBC_2.30
 GLIBC_2.31
 GLIBC_2.32
+GLIBC_2.33
--- a/lib/libc/include/aarch64-linux-gnu/bits/fcntl.h
+++ b/lib/libc/include/aarch64-linux-gnu/bits/fcntl.h
@@ -1,5 +1,5 @@
 /* O_*, F_*, FD_* bit values for the AArch64 Linux ABI.
-   Copyright (C) 2011-2020 Free Software Foundation, Inc.
+   Copyright (C) 2011-2021 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

--- a/lib/libc/include/aarch64-linux-gnu/bits/fenv.h
+++ b/lib/libc/include/aarch64-linux-gnu/bits/fenv.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2004-2020 Free Software Foundation, Inc.
+/* Copyright (C) 2004-2021 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

--- a/lib/libc/include/aarch64-linux-gnu/bits/fp-fast.h
+++ b/lib/libc/include/aarch64-linux-gnu/bits/fp-fast.h
@@ -1,5 +1,5 @@
 /* Define FP_FAST_* macros.  AArch64 version.
-   Copyright (C) 2016-2020 Free Software Foundation, Inc.
+   Copyright (C) 2016-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
--- a/lib/libc/include/aarch64-linux-gnu/bits/hwcap.h
+++ b/lib/libc/include/aarch64-linux-gnu/bits/hwcap.h
@@ -1,5 +1,5 @@
 /* Defines for bits in AT_HWCAP.  AArch64 Linux version.
-   Copyright (C) 2016-2020 Free Software Foundation, Inc.
+   Copyright (C) 2016-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
@@ -72,4 +72,5 @@
 #define HWCAP2_BF16		(1 << 14)
 #define HWCAP2_DGH		(1 << 15)
 #define HWCAP2_RNG		(1 << 16)
-#define HWCAP2_BTI		(1 << 17)
+#define HWCAP2_BTI		(1 << 17)
+#define HWCAP2_MTE		(1 << 18)
--- a/lib/libc/include/aarch64-linux-gnu/bits/link.h
+++ b/lib/libc/include/aarch64-linux-gnu/bits/link.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2005-2020 Free Software Foundation, Inc.
+/* Copyright (C) 2005-2021 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

--- a/lib/libc/include/aarch64-linux-gnu/bits/local_lim.h
+++ b/lib/libc/include/aarch64-linux-gnu/bits/local_lim.h
@@ -1,5 +1,5 @@
 /* Minimum guaranteed maximum values for system limits.  Linux version.
-   Copyright (C) 1993-2020 Free Software Foundation, Inc.
+   Copyright (C) 1993-2021 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

--- a/lib/libc/include/aarch64-linux-gnu/bits/long-double.h
+++ b/lib/libc/include/aarch64-linux-gnu/bits/long-double.h
@@ -1,5 +1,5 @@
 /* Properties of long double type.  ldbl-128 version.
-   Copyright (C) 2016-2020 Free Software Foundation, Inc.
+   Copyright (C) 2016-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
--- a/lib/libc/include/aarch64-linux-gnu/bits/mman.h
+++ b/lib/libc/include/aarch64-linux-gnu/bits/mman.h
@@ -1,5 +1,5 @@
 /* Definitions for POSIX memory map interface.  Linux/AArch64 version.
-   Copyright (C) 2020 Free Software Foundation, Inc.
+   Copyright (C) 2020-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
@@ -24,6 +24,7 @@
   arch/arm64/include/uapi/asm/mman.h.  */

 #define PROT_BTI	0x10
+#define PROT_MTE	0x20

 #include <bits/mman-map-flags-generic.h>

--- a/lib/libc/include/aarch64-linux-gnu/bits/procfs.h
+++ b/lib/libc/include/aarch64-linux-gnu/bits/procfs.h
@@ -1,5 +1,5 @@
 /* Types for registers for sys/procfs.h.  AArch64 version.
-   Copyright (C) 1996-2020 Free Software Foundation, Inc.
+   Copyright (C) 1996-2021 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

--- a/lib/libc/include/aarch64-linux-gnu/bits/pthreadtypes-arch.h
+++ b/lib/libc/include/aarch64-linux-gnu/bits/pthreadtypes-arch.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2002-2020 Free Software Foundation, Inc.
+/* Copyright (C) 2002-2021 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

--- a/lib/libc/include/aarch64-linux-gnu/bits/semaphore.h
+++ b/lib/libc/include/aarch64-linux-gnu/bits/semaphore.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2002-2020 Free Software Foundation, Inc.
+/* Copyright (C) 2002-2021 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

--- a/lib/libc/include/aarch64-linux-gnu/bits/setjmp.h
+++ b/lib/libc/include/aarch64-linux-gnu/bits/setjmp.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 1997-2020 Free Software Foundation, Inc.
+/* Copyright (C) 1997-2021 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

--- a/lib/libc/include/aarch64-linux-gnu/bits/sigstack.h
+++ b/lib/libc/include/aarch64-linux-gnu/bits/sigstack.h
@@ -1,5 +1,5 @@
 /* sigstack, sigaltstack definitions.
-   Copyright (C) 2015-2020 Free Software Foundation, Inc.
+   Copyright (C) 2015-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
--- a/lib/libc/include/aarch64-linux-gnu/bits/statfs.h
+++ b/lib/libc/include/aarch64-linux-gnu/bits/statfs.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2020 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Chris Metcalf <cmetcalf@tilera.com>, 2011.

--- a/lib/libc/include/aarch64-linux-gnu/bits/struct_rwlock.h
+++ b/lib/libc/include/aarch64-linux-gnu/bits/struct_rwlock.h
@@ -1,5 +1,5 @@
 /* AArch64 internal rwlock struct definitions.
-   Copyright (C) 2019-2020 Free Software Foundation, Inc.
+   Copyright (C) 2019-2021 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

--- a/lib/libc/include/aarch64-linux-gnu/bits/struct_stat.h
+++ b/lib/libc/include/aarch64-linux-gnu/bits/struct_stat.h
@@ -1,6 +1,6 @@
-/* Copyright (C) 2011-2020 Free Software Foundation, Inc.
+/* Definition for struct stat.
+   Copyright (C) 2020-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
-   Contributed by Chris Metcalf <cmetcalf@tilera.com>, 2011.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -17,29 +17,15 @@
   <https://www.gnu.org/licenses/>.  */

 #if !defined _SYS_STAT_H && !defined _FCNTL_H
-# error "Never include <bits/stat.h> directly; use <sys/stat.h> instead."
+# error "Never include <bits/struct_stat.h> directly; use <sys/stat.h> instead."
 #endif

-#ifndef _BITS_STAT_H
-#define _BITS_STAT_H	1
+#ifndef _BITS_STRUCT_STAT_H
+#define _BITS_STRUCT_STAT_H	1

 #include <bits/endian.h>
 #include <bits/wordsize.h>

-/* 64-bit libc uses the kernel's 'struct stat', accessed via the
-   stat() syscall; 32-bit libc uses the kernel's 'struct stat64'
-   and accesses it via the stat64() syscall.  All the various
-   APIs offered by libc use the kernel shape for their struct stat
-   structure; the only difference is that 32-bit programs not
-   using __USE_FILE_OFFSET64 only see the low 32 bits of some
-   of the fields (specifically st_ino, st_size, and st_blocks).  */
-#define _STAT_VER_KERNEL	0
-#define _STAT_VER_LINUX		0
-#define _STAT_VER		_STAT_VER_KERNEL
-
-/* Versions of the `xmknod' interface.  */
-#define _MKNOD_VER_LINUX	0
-
 #if defined __USE_FILE_OFFSET64
 # define __field64(type, type64, name) type64 name
 #elif __WORDSIZE == 64 || defined __INO_T_MATCHES_INO64_T
@@ -138,37 +124,4 @@ struct stat64
 /* Nanosecond resolution time values are supported.  */
 #define _STATBUF_ST_NSEC

-/* Encoding of the file mode.  */
-
-#define	__S_IFMT	0170000	/* These bits determine file type.  */
-
-/* File types.  */
-#define	__S_IFDIR	0040000	/* Directory.  */
-#define	__S_IFCHR	0020000	/* Character device.  */
-#define	__S_IFBLK	0060000	/* Block device.  */
-#define	__S_IFREG	0100000	/* Regular file.  */
-#define	__S_IFIFO	0010000	/* FIFO.  */
-#define	__S_IFLNK	0120000	/* Symbolic link.  */
-#define	__S_IFSOCK	0140000	/* Socket.  */
-
-/* POSIX.1b objects.  Note that these macros always evaluate to zero.  But
-   they do it by enforcing the correct use of the macros.  */
-#define __S_TYPEISMQ(buf)  ((buf)->st_mode - (buf)->st_mode)
-#define __S_TYPEISSEM(buf) ((buf)->st_mode - (buf)->st_mode)
-#define __S_TYPEISSHM(buf) ((buf)->st_mode - (buf)->st_mode)
-
-/* Protection bits.  */
-
-#define	__S_ISUID	04000	/* Set user ID on execution.  */
-#define	__S_ISGID	02000	/* Set group ID on execution.  */
-#define	__S_ISVTX	01000	/* Save swapped text after use (sticky).  */
-#define	__S_IREAD	0400	/* Read by owner.  */
-#define	__S_IWRITE	0200	/* Write by owner.  */
-#define	__S_IEXEC	0100	/* Execute by owner.  */
-
-#ifdef __USE_ATFILE
-# define UTIME_NOW	((1l << 30) - 1l)
-# define UTIME_OMIT	((1l << 30) - 2l)
-#endif
-
-#endif /* bits/stat.h */
+#endif /* _BITS_STRUCT_STAT_H  */
--- a/lib/libc/include/aarch64-linux-gnu/bits/typesizes.h
+++ b/lib/libc/include/aarch64-linux-gnu/bits/typesizes.h
@@ -1,5 +1,5 @@
 /* bits/typesizes.h -- underlying types for *_t.  For the generic Linux ABI.
-   Copyright (C) 2011-2020 Free Software Foundation, Inc.
+   Copyright (C) 2011-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Chris Metcalf <cmetcalf@tilera.com>, 2011.

--- a/lib/libc/include/aarch64-linux-gnu/bits/wordsize.h
+++ b/lib/libc/include/aarch64-linux-gnu/bits/wordsize.h
@@ -1,6 +1,6 @@
 /* Determine the wordsize from the preprocessor defines.

-   Copyright (C) 2016-2020 Free Software Foundation, Inc.
+   Copyright (C) 2016-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
--- a/lib/libc/include/aarch64-linux-gnu/fpu_control.h
+++ b/lib/libc/include/aarch64-linux-gnu/fpu_control.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 1996-2020 Free Software Foundation, Inc.
+/* Copyright (C) 1996-2021 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

--- a/lib/libc/include/aarch64-linux-gnu/ieee754.h
+++ b/lib/libc/include/aarch64-linux-gnu/ieee754.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 1992-2020 Free Software Foundation, Inc.
+/* Copyright (C) 1992-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
--- a/lib/libc/include/aarch64-linux-gnu/sys/elf.h
+++ b/lib/libc/include/aarch64-linux-gnu/sys/elf.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 1996-2020 Free Software Foundation, Inc.
+/* Copyright (C) 1996-2021 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

--- a/lib/libc/include/aarch64-linux-gnu/sys/ptrace.h
+++ b/lib/libc/include/aarch64-linux-gnu/sys/ptrace.h
@@ -1,5 +1,5 @@
 /* `ptrace' debugger support interface.  Linux/AArch64 version.
-   Copyright (C) 1996-2020 Free Software Foundation, Inc.
+   Copyright (C) 1996-2021 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

--- a/lib/libc/include/aarch64-linux-gnu/sys/ucontext.h
+++ b/lib/libc/include/aarch64-linux-gnu/sys/ucontext.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 1998-2020 Free Software Foundation, Inc.
+/* Copyright (C) 1998-2021 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

--- a/Show More
+++ b/Show More